--- /mnt/winc/gas/vf_spp0.c Sun Apr 24 19:40:20 2005 +++ /mnt/winc/gas/vf_spp.c Sun Apr 24 19:39:18 2005 @@ -21,6 +21,8 @@ * "Aria Nosratinia Embedded Post-Processing for * Enhancement of Compressed Images (1999)" * (http://citeseer.nj.nec.com/nosratinia99embedded.html) + * Futher, with splitting (i)dct into hor/ver passes, one of them can be + * performed once per block, not pixel. This allows for much better speed. */ @@ -54,10 +56,19 @@ #include "vf.h" #include "../libvo/fastmemcpy.h" -#define XMIN(a,b) ((a) < (b) ? (a) : (b)) - //===========================================================================// -static const uint8_t __attribute__((aligned(8))) dither[8][8]={ +static const short custom_threshold[64]= +{ 71, 972, 295, 237, 71, 40, 38, 19, + 245, 193, 185, 121, 102, 73, 53, 27, + 158, 129, 141, 107, 97, 73, 50, 26, + 102, 116, 109, 98, 82, 66, 45, 23, + 71, 94, 95, 81, 70, 56, 38, 20, + 56, 77, 74, 66, 56, 44, 30, 15, + 38, 53, 50, 45, 38, 30, 21, 11, + 20, 27, 26, 23, 20, 15, 11, 5 +}; + +static const uint8_t __attribute__((aligned(32))) dither[8][8]={ { 0, 48, 12, 60, 3, 51, 15, 63, }, { 32, 16, 44, 28, 35, 19, 47, 31, }, { 8, 56, 4, 52, 11, 59, 7, 55, }, @@ -68,252 +79,65 @@ { 42, 26, 38, 22, 41, 25, 37, 21, }, }; -static const uint8_t offset[127][2]= { -{0,0}, -{0,0}, {4,4}, -{0,0}, {2,2}, {6,4}, {4,6}, -{0,0}, {5,1}, {2,2}, {7,3}, {4,4}, {1,5}, {6,6}, {3,7}, - -{0,0}, {4,0}, {1,1}, {5,1}, {3,2}, {7,2}, {2,3}, {6,3}, -{0,4}, {4,4}, {1,5}, {5,5}, {3,6}, {7,6}, {2,7}, {6,7}, - -{0,0}, {0,2}, {0,4}, {0,6}, {1,1}, {1,3}, {1,5}, {1,7}, -{2,0}, {2,2}, {2,4}, {2,6}, {3,1}, {3,3}, {3,5}, {3,7}, -{4,0}, {4,2}, {4,4}, {4,6}, {5,1}, {5,3}, {5,5}, {5,7}, -{6,0}, {6,2}, {6,4}, {6,6}, {7,1}, {7,3}, {7,5}, {7,7}, - -{0,0}, {4,4}, {0,4}, {4,0}, {2,2}, {6,6}, {2,6}, {6,2}, -{0,2}, {4,6}, {0,6}, {4,2}, {2,0}, {6,4}, {2,4}, {6,0}, -{1,1}, {5,5}, {1,5}, {5,1}, {3,3}, {7,7}, {3,7}, {7,3}, -{1,3}, {5,7}, {1,7}, {5,3}, {3,1}, {7,5}, {3,5}, {7,1}, -{0,1}, {4,5}, {0,5}, {4,1}, {2,3}, {6,7}, {2,7}, {6,3}, -{0,3}, {4,7}, {0,7}, {4,3}, {2,1}, {6,5}, {2,5}, {6,1}, -{1,0}, {5,4}, {1,4}, {5,0}, {3,2}, {7,6}, {3,6}, {7,2}, -{1,2}, {5,6}, {1,6}, {5,2}, {3,0}, {7,4}, {3,4}, {7,0}, -}; +struct vf_priv_s { //align 16 ! + uint64_t threshold_mtx_noq[8*2]; + uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions -struct vf_priv_s { int log2_count; + int temp_stride; int qp; - int mode; int mpeg2; - int temp_stride; + int prev_q; uint8_t *src; int16_t *temp; - AVCodecContext *avctx; - DSPContext dsp; + //int mode; }; -#define SHIFT 22 - -static void hardthresh_c(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){ - int i; - int bias= 0; //FIXME - unsigned int threshold1, threshold2; - - threshold1= qp*((1<<4) - bias) - 1; - threshold2= (threshold1<<1); - - memset(dst, 0, 64*sizeof(DCTELEM)); - dst[0]= (src[0] + 4)>>3; - - for(i=1; i<64; i++){ - int level= src[i]; - if(((unsigned)(level+threshold1))>threshold2){ - const int j= permutation[i]; - dst[j]= (level + 4)>>3; - } - } -} - -static void softthresh_c(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){ - int i; - int bias= 0; //FIXME - unsigned int threshold1, threshold2; - - threshold1= qp*((1<<4) - bias) - 1; - threshold2= (threshold1<<1); - memset(dst, 0, 64*sizeof(DCTELEM)); - dst[0]= (src[0] + 4)>>3; - - for(i=1; i<64; i++){ - int level= src[i]; - if(((unsigned)(level+threshold1))>threshold2){ - const int j= permutation[i]; - if(level>0) - dst[j]= (level - threshold1 + 4)>>3; - else - dst[j]= (level + threshold1 + 4)>>3; - } - } -} - -#ifdef HAVE_MMX -static void hardthresh_mmx(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){ - int bias= 0; //FIXME - unsigned int threshold1; - - threshold1= qp*((1<<4) - bias) - 1; - - asm volatile( -#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ - "movq " #src0 ", %%mm0 \n\t"\ - "movq " #src1 ", %%mm1 \n\t"\ - "movq " #src2 ", %%mm2 \n\t"\ - "movq " #src3 ", %%mm3 \n\t"\ - "psubw %%mm4, %%mm0 \n\t"\ - "psubw %%mm4, %%mm1 \n\t"\ - "psubw %%mm4, %%mm2 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "paddusw %%mm5, %%mm0 \n\t"\ - "paddusw %%mm5, %%mm1 \n\t"\ - "paddusw %%mm5, %%mm2 \n\t"\ - "paddusw %%mm5, %%mm3 \n\t"\ - "paddw %%mm6, %%mm0 \n\t"\ - "paddw %%mm6, %%mm1 \n\t"\ - "paddw %%mm6, %%mm2 \n\t"\ - "paddw %%mm6, %%mm3 \n\t"\ - "psubusw %%mm6, %%mm0 \n\t"\ - "psubusw %%mm6, %%mm1 \n\t"\ - "psubusw %%mm6, %%mm2 \n\t"\ - "psubusw %%mm6, %%mm3 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "psraw $3, %%mm2 \n\t"\ - "psraw $3, %%mm3 \n\t"\ -\ - "movq %%mm0, %%mm7 \n\t"\ - "punpcklwd %%mm2, %%mm0 \n\t" /*A*/\ - "punpckhwd %%mm2, %%mm7 \n\t" /*C*/\ - "movq %%mm1, %%mm2 \n\t"\ - "punpcklwd %%mm3, %%mm1 \n\t" /*B*/\ - "punpckhwd %%mm3, %%mm2 \n\t" /*D*/\ - "movq %%mm0, %%mm3 \n\t"\ - "punpcklwd %%mm1, %%mm0 \n\t" /*A*/\ - "punpckhwd %%mm7, %%mm3 \n\t" /*C*/\ - "punpcklwd %%mm2, %%mm7 \n\t" /*B*/\ - "punpckhwd %%mm2, %%mm1 \n\t" /*D*/\ -\ - "movq %%mm0, " #dst0 " \n\t"\ - "movq %%mm7, " #dst1 " \n\t"\ - "movq %%mm3, " #dst2 " \n\t"\ - "movq %%mm1, " #dst3 " \n\t" - - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "movd %4, %%mm6 \n\t" - "packssdw %%mm4, %%mm4 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm4, %%mm4 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0)) - REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0)) - REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0)) - REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0)) - : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed? - ); - dst[0]= (src[0] + 4)>>3; -} +#ifndef HAVE_MMX -static void softthresh_mmx(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){ - int bias= 0; //FIXME - unsigned int threshold1; - - threshold1= qp*((1<<4) - bias) - 1; - - asm volatile( -#undef REQUANT_CORE -#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ - "movq " #src0 ", %%mm0 \n\t"\ - "movq " #src1 ", %%mm1 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "pcmpgtw %%mm0, %%mm6 \n\t"\ - "pcmpgtw %%mm1, %%mm7 \n\t"\ - "pxor %%mm6, %%mm0 \n\t"\ - "pxor %%mm7, %%mm1 \n\t"\ - "psubusw %%mm4, %%mm0 \n\t"\ - "psubusw %%mm4, %%mm1 \n\t"\ - "pxor %%mm6, %%mm0 \n\t"\ - "pxor %%mm7, %%mm1 \n\t"\ - "movq " #src2 ", %%mm2 \n\t"\ - "movq " #src3 ", %%mm3 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "pcmpgtw %%mm2, %%mm6 \n\t"\ - "pcmpgtw %%mm3, %%mm7 \n\t"\ - "pxor %%mm6, %%mm2 \n\t"\ - "pxor %%mm7, %%mm3 \n\t"\ - "psubusw %%mm4, %%mm2 \n\t"\ - "psubusw %%mm4, %%mm3 \n\t"\ - "pxor %%mm6, %%mm2 \n\t"\ - "pxor %%mm7, %%mm3 \n\t"\ -\ - "paddsw %%mm5, %%mm0 \n\t"\ - "paddsw %%mm5, %%mm1 \n\t"\ - "paddsw %%mm5, %%mm2 \n\t"\ - "paddsw %%mm5, %%mm3 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "psraw $3, %%mm2 \n\t"\ - "psraw $3, %%mm3 \n\t"\ -\ - "movq %%mm0, %%mm7 \n\t"\ - "punpcklwd %%mm2, %%mm0 \n\t" /*A*/\ - "punpckhwd %%mm2, %%mm7 \n\t" /*C*/\ - "movq %%mm1, %%mm2 \n\t"\ - "punpcklwd %%mm3, %%mm1 \n\t" /*B*/\ - "punpckhwd %%mm3, %%mm2 \n\t" /*D*/\ - "movq %%mm0, %%mm3 \n\t"\ - "punpcklwd %%mm1, %%mm0 \n\t" /*A*/\ - "punpckhwd %%mm7, %%mm3 \n\t" /*C*/\ - "punpcklwd %%mm2, %%mm7 \n\t" /*B*/\ - "punpckhwd %%mm2, %%mm1 \n\t" /*D*/\ -\ - "movq %%mm0, " #dst0 " \n\t"\ - "movq %%mm7, " #dst1 " \n\t"\ - "movq %%mm3, " #dst2 " \n\t"\ - "movq %%mm1, " #dst3 " \n\t" - - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "packssdw %%mm4, %%mm4 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm4, %%mm4 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0)) - REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0)) - REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0)) - REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0)) - : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed? - ); +//This func reads from 1 slice, 1 and clears 0 & 1 +static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale) +{int y, x, src_stridew; +#define STORE(pos) \ + temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale);\ + src[x + pos]=src[x + pos - 8*src_stride]=0;\ + if(temp & 0x100) temp= ~(temp>>31);\ + dst[x + pos]= temp; - dst[0]= (src[0] + 4)>>3; + src_stridew=src_stride-width; + dst_stride-=width; + for(y=0; y>6;\ +//This func reads from 2 slices, 0 & 2 and clears 2-nd +static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale) +{int y, x, src_stridew; +#define STORE2(pos) \ + temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale);\ + src[x + pos + 16*src_stride]=0;\ if(temp & 0x100) temp= ~(temp>>31);\ - dst[x + y*dst_stride + pos]= temp; + dst[x + pos]= temp; + src_stridew=src_stride-width; + dst_stride-=width; for(y=0; ythreshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C +} + +static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt); +static void row_idct_c(DCTELEM* workspace, + int16_t* output_adr, int output_stride, int cnt); +static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt); + +//this is rather ugly, but there is no need for function pointers +#define store_slice_s store_slice_c +#define store_slice2_s store_slice2_c +#define mul_thrmat_s mul_thrmat_c +#define column_fidct_s column_fidct_c +#define row_idct_s row_idct_c +#define row_fdct_s row_fdct_c + +#else //MMX functions + +//This func reads from 1 slice, 1 and clears 0 & 1 +static void store_slice_mmx(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale) +{const uint8_t *od=&dither[0][0]; + dst_stride-=width; + //src_stride=(src_stride-width)*2; + height=(int)(&dither[height][0]); asm volatile( - "movq (%3), %%mm3 \n\t" - "movq (%3), %%mm4 \n\t" - "movd %4, %%mm2 \n\t" - "pxor %%mm0, %%mm0 \n\t" - "punpcklbw %%mm0, %%mm3 \n\t" - "punpckhbw %%mm0, %%mm4 \n\t" - "psraw %%mm2, %%mm3 \n\t" - "psraw %%mm2, %%mm4 \n\t" - "movd %5, %%mm2 \n\t" - "1: \n\t" - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" + "movd %%edx, %%mm5 \n\t" + "xorl $-1, %%edx \n\t" + "movl %%eax, %%ecx \n\t" + "addl $7, %%edx \n\t" + "negl %%eax \n\t" + "subl %0, %%ecx \n\t" + "addl %%ecx, %%ecx \n\t" + "movd %%edx, %%mm2 \n\t" + "movl %%ecx, %1 \n\t" + "movl %2, %%edx \n\t" + "shll $4, %%eax \n\t" + +"L_2: \n\t" + "movq (%%edx), %%mm3 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "movl %0, %%ecx \n\t" + "psraw %%mm5, %%mm3 \n\t" + "psraw %%mm5, %%mm4 \n\t" +"L_1: \n\t" + "movq %%mm7, (%%esi,%%eax,) \n\t" + "movq (%%esi), %%mm0 \n\t" + "movq 8(%%esi), %%mm1 \n\t" + + "movq %%mm7, 8(%%esi,%%eax,) \n\t" "paddw %%mm3, %%mm0 \n\t" "paddw %%mm4, %%mm1 \n\t" + + "movq %%mm7, (%%esi) \n\t" "psraw %%mm2, %%mm0 \n\t" "psraw %%mm2, %%mm1 \n\t" + + "movq %%mm7, 8(%%esi) \n\t" "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%1) \n\t" - "add $16, %0 \n\t" - "add $8, %1 \n\t" - "cmp %2, %1 \n\t" - " jb 1b \n\t" - : "+r" (src1), "+r"(dst1) - : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(6-log2_scale) + "addl $16, %%esi \n\t" + + "movq %%mm0, (%%edi) \n\t" + "addl $8, %%edi \n\t" + "subl $8, %%ecx \n\t" + "jg L_1b \n\t" + "addl %1, %%esi \n\t" + "addl $8, %%edx \n\t" + "addl %3, %%edi \n\t" + "cmpl %4, %%edx \n\t" + "jl L_2b \n\t" + + : + : "g" (width), "a" (src_stride), "g" (od), "g" (dst_stride), "g" (height), + "d" (log2_scale), "S" (src), "D" (dst) //input + : "%eax", "%ecx", "%edx", "%edi", "%esi" ); - src += src_stride; - dst += dst_stride; } -// if(width != mmxw) -// store_slice_c(dst + mmxw, src + mmxw, dst_stride, src_stride, width - mmxw, log2_scale); + +//This func reads from 2 slices, 0 & 2 and clears 2-nd +static void store_slice2_mmx(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale) +{const uint8_t *od=&dither[0][0]; + dst_stride-=width; + //src_stride=(src_stride-width)*2; + height=(int)(&dither[height][0]); + asm volatile( + "movd %%edx, %%mm5 \n\t" + "xorl $-1, %%edx \n\t" + "movl %%eax, %%ecx \n\t" + "addl $7, %%edx \n\t" + "subl %0, %%ecx \n\t" + "addl %%ecx, %%ecx \n\t" + "movd %%edx, %%mm2 \n\t" + "movl %%ecx, %1 \n\t" + "movl %2, %%edx \n\t" + "shll $5, %%eax \n\t" + +"L_2: \n\t" + "movq (%%edx), %%mm3 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "movl %0, %%ecx \n\t" + "psraw %%mm5, %%mm3 \n\t" + "psraw %%mm5, %%mm4 \n\t" +"L_1: \n\t" + "movq (%%esi), %%mm0 \n\t" + "movq 8(%%esi), %%mm1 \n\t" + "paddw %%mm3, %%mm0 \n\t" + + "paddw (%%esi,%%eax,), %%mm0 \n\t" + "paddw %%mm4, %%mm1 \n\t" + "movq 8(%%esi,%%eax,), %%mm6 \n\t" + + "movq %%mm7, (%%esi,%%eax,) \n\t" + "psraw %%mm2, %%mm0 \n\t" + "paddw %%mm6, %%mm1 \n\t" + + "movq %%mm7, 8(%%esi,%%eax,) \n\t" + "psraw %%mm2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + + "movq %%mm0, (%%edi) \n\t" + "addl $16, %%esi \n\t" + "addl $8, %%edi \n\t" + "subl $8, %%ecx \n\t" + "jg L_1b \n\t" + "addl %1, %%esi \n\t" + "addl $8, %%edx \n\t" + "addl %3, %%edi \n\t" + "cmpl %4, %%edx \n\t" + "jl L_2b \n\t" + + : + : "g" (width), "a" (src_stride), "g" (od), "g" (dst_stride), "g" (height), + "d" (log2_scale), "S" (src), "D" (dst) //input + : "%eax", "%ecx", "%edx", "%edi", "%esi" + ); } -#endif -static void (*store_slice)(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)= store_slice_c; +static void mul_thrmat_mmx(int q) +{int adr=(int)(&p->threshold_mtx_noq[0]); + asm volatile( + "movd %0, %%mm7 \n\t" + "addl $8*8*2, %%edi \n\t" + "movq 0*8(%%esi), %%mm0 \n\t" + "punpcklwd %%mm7, %%mm7 \n\t" + "movq 1*8(%%esi), %%mm1 \n\t" + "punpckldq %%mm7, %%mm7 \n\t" + "pmullw %%mm7, %%mm0 \n\t" + + "movq 2*8(%%esi), %%mm2 \n\t" + "pmullw %%mm7, %%mm1 \n\t" + + "movq 3*8(%%esi), %%mm3 \n\t" + "pmullw %%mm7, %%mm2 \n\t" + + "movq %%mm0, 0*8(%%edi) \n\t" + "movq 4*8(%%esi), %%mm4 \n\t" + "pmullw %%mm7, %%mm3 \n\t" + + "movq %%mm1, 1*8(%%edi) \n\t" + "movq 5*8(%%esi), %%mm5 \n\t" + "pmullw %%mm7, %%mm4 \n\t" + + "movq %%mm2, 2*8(%%edi) \n\t" + "movq 6*8(%%esi), %%mm6 \n\t" + "pmullw %%mm7, %%mm5 \n\t" + + "movq %%mm3, 3*8(%%edi) \n\t" + "movq 7*8+0*8(%%esi), %%mm0 \n\t" + "pmullw %%mm7, %%mm6 \n\t" + + "movq %%mm4, 4*8(%%edi) \n\t" + "movq 7*8+1*8(%%esi), %%mm1 \n\t" + "pmullw %%mm7, %%mm0 \n\t" + + "movq %%mm5, 5*8(%%edi) \n\t" + "movq 7*8+2*8(%%esi), %%mm2 \n\t" + "pmullw %%mm7, %%mm1 \n\t" + + "movq %%mm6, 6*8(%%edi) \n\t" + "movq 7*8+3*8(%%esi), %%mm3 \n\t" + "pmullw %%mm7, %%mm2 \n\t" + + "movq %%mm0, 7*8+0*8(%%edi) \n\t" + "movq 7*8+4*8(%%esi), %%mm4 \n\t" + "pmullw %%mm7, %%mm3 \n\t" + + "movq %%mm1, 7*8+1*8(%%edi) \n\t" + "movq 7*8+5*8(%%esi), %%mm5 \n\t" + "pmullw %%mm7, %%mm4 \n\t" + + "movq %%mm2, 7*8+2*8(%%edi) \n\t" + "movq 7*8+6*8(%%esi), %%mm6 \n\t" + "pmullw %%mm7, %%mm5 \n\t" + + "movq %%mm3, 7*8+3*8(%%edi) \n\t" + "movq 14*8+0*8(%%esi), %%mm0 \n\t" + "pmullw %%mm7, %%mm6 \n\t" + + "movq %%mm4, 7*8+4*8(%%edi) \n\t" + "movq 14*8+1*8(%%esi), %%mm1 \n\t" + "pmullw %%mm7, %%mm0 \n\t" + + "movq %%mm5, 7*8+5*8(%%edi) \n\t" + "pmullw %%mm7, %%mm1 \n\t" + + "movq %%mm6, 7*8+6*8(%%edi) \n\t" + "movq %%mm0, 14*8+0*8(%%edi) \n\t" + "movq %%mm1, 14*8+1*8(%%edi) \n\t" + + : + : "g" (q), "S" (adr), "D" (adr) + : "%edi", "%esi" + ); +} -static void (*requantize)(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation)= hardthresh_c; +static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt); +static void row_idct_mmx(DCTELEM* workspace, + int16_t* output_adr, int output_stride, int cnt); +static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt) + +#define store_slice_s store_slice_mmx +#define store_slice2_s store_slice2_mmx +#define mul_thrmat_s mul_thrmat_mmx +#define column_fidct_s column_fidct_mmx +#define row_idct_s row_idct_mmx +#define row_fdct_s row_fdct_mmx +#endif static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma){ - int x, y, i; - const int count= 1<log2_count; - const int stride= is_luma ? p->temp_stride : ((width+16+15)&(~15)); - uint64_t block_align[32]; + int x, x0, y, es, qy, t; + const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15)) + const int step=6-p->log2_count; + const int qps= 3 + is_luma; + int32_t block_align1[4*8*BLOCKSZ+ 4*8*BLOCKSZ+8];//32 + int32_t *block_align=(int32_t*)(((int)block_align1+31)&(~31)); DCTELEM *block = (DCTELEM *)block_align; - DCTELEM *block2= (DCTELEM *)(block_align+16); + DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ); + //p->src=src-src_stride*8-8;//! if (!src || !dst) return; // HACK avoid crash for Y8 colourspace for(y=0; ysrc + index, src + y*src_stride, width); + memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers for(x=0; x<8; x++){ p->src[index - x - 1]= p->src[index + x ]; p->src[index + width + x ]= p->src[index + width - x - 1]; @@ -398,43 +420,54 @@ } //FIXME (try edge emu) - for(y=0; ytemp + (8+y)*stride, 0, 8*stride*sizeof(int16_t)); - for(x=0; xtemp+ 8 +y*stride, 0,width*sizeof(int16_t)); + for(y=step; yheight-1) qy=height-1; + if (qy<0) qy=0; + qy=(qy>>qps)*qp_stride; + + row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2); + for(x0=0; x0src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1)); if(p->qp) - qp= p->qp; - else{ - qp= qp_store[ (XMIN(x, width-1)>>qps) + (XMIN(y, height-1)>>qps) * qp_stride]; - if(p->mpeg2) qp>>=1; - } - for(i=0; idsp.get_pixels(block, p->src + index, stride); - p->dsp.fdct(block); - requantize(block2, block, qp, p->dsp.idct_permutation); - p->dsp.idct(block2); - add_block(p->temp + index, stride, block2); - } - } - if(y) - store_slice(dst + (y-8)*dst_stride, p->temp + 8 + y*stride, dst_stride, stride, width, XMIN(8, height+8-y), 6-p->log2_count); - } -#if 0 - for(y=0; y>6) ^ (y>>6)) & 1) == 0) - dst[x + y*dst_stride]= p->src[8 + 8*stride + x + y*stride]; - if((x&63) == 0 || (y&63)==0) - dst[x + y*dst_stride] += 128; - } + column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT + else + for (x=0; x<8*(BLOCKSZ-1); x+=8) { + t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same + if (t<0) t=0;//t always < width-2 + t=qp_store[qy+(t>>qps)]; + if(p->mpeg2) t>>=1; //copy p->mpeg2,prev_q to locals? + if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(t); + column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT + } + row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1)); + memcpy(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); //cycling + memcpy(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM)); + } + // + es=width+8-x0; // 8, ... + if (es>8) + row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2); + column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es-0); + row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2); + {const int y1=y-8+step;//l5-7 l4-6 + if (!(y1&7) && y1) { + if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride, + dst_stride, stride, width, 8, 5-p->log2_count); + else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride, + dst_stride, stride, width, 8, 5-p->log2_count); + } } + } + + if (y&7) { // == height & 7 + if (y&8) store_slice_s(dst + (y-8&~7)*dst_stride, p->temp+ 8 +8*stride, + dst_stride, stride, width, y&7, 5-p->log2_count); + else store_slice2_s(dst + (y-8&~7)*dst_stride, p->temp+ 8 +0*stride, + dst_stride, stride, width, y&7, 5-p->log2_count); } -#endif - //FIXME reorder for better caching } static int config(struct vf_instance_s* vf, @@ -443,8 +476,9 @@ int h= (height+16+15)&(~15); vf->priv->temp_stride= (width+16+15)&(~15); - vf->priv->temp= malloc(vf->priv->temp_stride*h*sizeof(int16_t)); - vf->priv->src = malloc(vf->priv->temp_stride*h*sizeof(uint8_t)); + vf->priv->temp= (int16_t*)av_malloc(vf->priv->temp_stride*3*8*sizeof(int16_t)); + //this can also be avoided, see above + vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t)); return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt); } @@ -506,14 +540,14 @@ static void uninit(struct vf_instance_s* vf){ if(!vf->priv) return; - if(vf->priv->temp) free(vf->priv->temp); + if(vf->priv->temp) av_free(vf->priv->temp); vf->priv->temp= NULL; - if(vf->priv->src) free(vf->priv->src); + if(vf->priv->src) av_free(vf->priv->src); vf->priv->src= NULL; - if(vf->priv->avctx) free(vf->priv->avctx); - vf->priv->avctx= NULL; + //if(vf->priv->avctx) free(vf->priv->avctx); + //vf->priv->avctx= NULL; - free(vf->priv); + av_free(vf->priv); vf->priv=NULL; } @@ -554,16 +588,18 @@ static int control(struct vf_instance_s* vf, int request, void* data){ switch(request){ case VFCTRL_QUERY_MAX_PP_LEVEL: - return 6; + return 5; case VFCTRL_SET_PP_LEVEL: vf->priv->log2_count= *((unsigned int*)data); + if (vf->priv->log2_count < 4) vf->priv->log2_count=4; return CONTROL_TRUE; } return vf_next_control(vf,request,data); } static int open(vf_instance_t *vf, char* args){ - + int i=0, bias; + int custom_threshold_m[64]; int log2c=-1; vf->config=config; @@ -572,39 +608,40 @@ vf->query_format=query_format; vf->uninit=uninit; vf->control= control; - vf->priv=malloc(sizeof(struct vf_priv_s)); - memset(vf->priv, 0, sizeof(struct vf_priv_s)); + vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 ! avcodec_init(); - vf->priv->avctx= avcodec_alloc_context(); - dsputil_init(&vf->priv->dsp, vf->priv->avctx); + //vf->priv->avctx= avcodec_alloc_context(); + //dsputil_init(&vf->priv->dsp, vf->priv->avctx); - vf->priv->log2_count= 3; + vf->priv->log2_count= 4; - if (args) sscanf(args, "%d:%d:%d", &log2c, &vf->priv->qp, &vf->priv->mode); + if (args) sscanf(args, "%d:%d:%d", &log2c, &vf->priv->qp, &i); - if( log2c >=0 && log2c <=6 ) + if( log2c >=4 && log2c <=5 ) vf->priv->log2_count = log2c; if(vf->priv->qp < 0) vf->priv->qp = 0; - switch(vf->priv->mode){ - default: - case 0: requantize= hardthresh_c; break; - case 1: requantize= softthresh_c; break; + bias= (1<<4)+i; //regulable + vf->priv->prev_q=0; + // + for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this ! + custom_threshold_m[i]=int(custom_threshold[i]*(bias/71.)+ 0.5); + for(i=0;i<8;i++){ + vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2] + |(((uint64_t)custom_threshold_m[i*8+6])<<16) + |(((uint64_t)custom_threshold_m[i*8+0])<<32) + |(((uint64_t)custom_threshold_m[i*8+4])<<48); + vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5] + |(((uint64_t)custom_threshold_m[i*8+3])<<16) + |(((uint64_t)custom_threshold_m[i*8+1])<<32) + |(((uint64_t)custom_threshold_m[i*8+7])<<48); } -#ifdef HAVE_MMX - if(gCpuCaps.hasMMX){ - store_slice= store_slice_mmx; - switch(vf->priv->mode){ - case 0: requantize= hardthresh_mmx; break; - case 1: requantize= softthresh_mmx; break; - } - } -#endif + if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv->qp); return 1; } @@ -617,5 +654,1409 @@ open, NULL }; + +//==================================================================== +//Specific spp's dct, idct and threshold functions +//I'd prefer to have them in the separate file. + +//#define MANGLE(a) #a + +typedef int16_t DCTELEM; + +#define DCTSIZE 8 +#define DCTSIZE_S "8" + +#define FIX(x,s) ((int) ((x) * (1<>16) +#define THRESHOLD(r,x,t) if(((unsigned)((x)+t))>t*2) r=(x);else r=0; +#define DESCALE(x,n) ((x) + (1 << ((n)-1)) >> n) + +static uint64_t temps[4];//!! + +static const uint64_t MM_FIX_0_382683433=FIX64(0.382683433, 14); +static const uint64_t MM_FIX_0_541196100=FIX64(0.541196100, 14); +static const uint64_t MM_FIX_0_707106781=FIX64(0.707106781, 14); +static const uint64_t MM_FIX_1_306562965=FIX64(1.306562965, 14); + +static const uint64_t MM_FIX_1_414213562_A=FIX64(1.414213562, 14); + +static const uint64_t MM_FIX_1_847759065=FIX64(1.847759065, 13); +static const uint64_t MM_FIX_2_613125930=FIX64(-2.613125930, 13); //- +static const uint64_t MM_FIX_1_414213562=FIX64(1.414213562, 13); +static const uint64_t MM_FIX_1_082392200=FIX64(1.082392200, 13); +//for t3,t5,t7 == 0 shortcut +static const uint64_t MM_FIX_0_847759065=FIX64(0.847759065, 14); +static const uint64_t MM_FIX_0_566454497=FIX64(0.566454497, 14); +static const uint64_t MM_FIX_0_198912367=FIX64(0.198912367, 14); + +static const uint64_t MM_DESCALE_RND=C64(4); +static const uint64_t MM_2=C64(2); + + +typedef int32_t int_simd16_t; +static const int16_t FIX_0_382683433=FIX(0.382683433, 14); +static const int16_t FIX_0_541196100=FIX(0.541196100, 14); +static const int16_t FIX_0_707106781=FIX(0.707106781, 14); +static const int16_t FIX_1_306562965=FIX(1.306562965, 14); +static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14); +static const int16_t FIX_1_847759065=FIX(1.847759065, 13); +static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //- +static const int16_t FIX_1_414213562=FIX(1.414213562, 13); +static const int16_t FIX_1_082392200=FIX(1.082392200, 13); + +static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt) +{int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int_simd16_t tmp10, tmp11, tmp12, tmp13; + int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13; + int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7; + + DCTELEM* dataptr; + DCTELEM* wsptr; + int16_t *threshold; + int ctr; + + dataptr = data; + wsptr = output; + + for (; cnt > 0; cnt-=2) { //start positions + threshold=(int16_t*)thr_adr;//threshold_mtx + for (ctr = DCTSIZE; ctr > 0; ctr--) { + // Process columns from input, add to output. + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + // Even part of FDCT + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + d0 = tmp10 + tmp11; + d4 = tmp10 - tmp11; + + z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781); + d2 = tmp13 + z1; + d6 = tmp13 - z1; + + // Even part of IDCT + + THRESHOLD(tmp0, d0, threshold[0*8]); + THRESHOLD(tmp1, d2, threshold[2*8]); + THRESHOLD(tmp2, d4, threshold[4*8]); + THRESHOLD(tmp3, d6, threshold[6*8]); + tmp0+=2; + tmp10 = (tmp0 + tmp2)>>2; + tmp11 = (tmp0 - tmp2)>>2; + + tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides) + tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2 + + tmp0 = tmp10 + tmp13; //->temps + tmp3 = tmp10 - tmp13; //->temps + tmp1 = tmp11 + tmp12; //->temps + tmp2 = tmp11 - tmp12; //->temps + + // Odd part of FDCT + + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433); + z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5; + z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5; + z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781); + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + d5 = z13 + z2; + d3 = z13 - z2; + d1 = z11 + z4; + d7 = z11 - z4; + + // Odd part of IDCT + + THRESHOLD(tmp4, d1, threshold[1*8]); + THRESHOLD(tmp5, d3, threshold[3*8]); + THRESHOLD(tmp6, d5, threshold[5*8]); + THRESHOLD(tmp7, d7, threshold[7*8]); + + //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0 + z13 = tmp6 + tmp5; + z10 = (tmp6 - tmp5)<<1; + z11 = tmp4 + tmp7; + z12 = (tmp4 - tmp7)<<1; + + tmp7 = (z11 + z13)>>2; //+2 ! + tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562); + z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + wsptr[DCTSIZE*0]+= (tmp0 + tmp7); + wsptr[DCTSIZE*1]+= (tmp1 + tmp6); + wsptr[DCTSIZE*2]+= (tmp2 + tmp5); + wsptr[DCTSIZE*3]+= (tmp3 - tmp4); + wsptr[DCTSIZE*4]+= (tmp3 + tmp4); + wsptr[DCTSIZE*5]+= (tmp2 - tmp5); + wsptr[DCTSIZE*6]= (tmp1 - tmp6); + wsptr[DCTSIZE*7]= (tmp0 - tmp7); + // + dataptr++; //next column + wsptr++; + threshold++; + } + dataptr+=8; //skip each second start pos + wsptr +=8; + } +} + +static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt) +{//uint64_t temps[4]; + asm volatile( +".align 16 \n\t" +"L_COLUMS: \n\t" + "movq "DCTSIZE_S"*0*2(%%esi), %%mm1 \n\t" + // + "movq "DCTSIZE_S"*3*2(%%esi), %%mm7 \n\t" + "movq %%mm1, %%mm0 \n\t" + + "paddw "DCTSIZE_S"*7*2(%%esi), %%mm1 \n\t" //t0 + "movq %%mm7, %%mm3 \n\t" + + "paddw "DCTSIZE_S"*4*2(%%esi), %%mm7 \n\t" //t3 + "movq %%mm1, %%mm5 \n\t" + + "movq "DCTSIZE_S"*1*2(%%esi), %%mm6 \n\t" + "psubw %%mm7, %%mm1 \n\t" //t13 + + "movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t" + "movq %%mm6, %%mm4 \n\t" + + "paddw "DCTSIZE_S"*6*2(%%esi), %%mm6 \n\t" //t1 + "paddw %%mm7, %%mm5 \n\t" //t10 + + "paddw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t2 + "movq %%mm6, %%mm7 \n\t" + + "paddw %%mm2, %%mm6 \n\t" //t11 + "psubw %%mm2, %%mm7 \n\t" //t12 + + "movq %%mm5, %%mm2 \n\t" + "paddw %%mm6, %%mm5 \n\t" //d0 + // i0 t13 t12 i3 i1 d0 - d4 + "psubw %%mm6, %%mm2 \n\t" //d4 + "paddw %%mm1, %%mm7 \n\t" + + "movq 4*16(%%edx), %%mm6 \n\t" + "psllw $2, %%mm7 \n\t" + + "psubw 0*16(%%edx), %%mm5 \n\t" + "psubw %%mm6, %%mm2 \n\t" + + "paddusw 0*16(%%edx), %%mm5 \n\t" + "paddusw %%mm6, %%mm2 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t" + // + "paddw 0*16(%%edx), %%mm5 \n\t" + "paddw %%mm6, %%mm2 \n\t" + + "psubusw 0*16(%%edx), %%mm5 \n\t" + "psubusw %%mm6, %%mm2 \n\t" + +//This func is totally compute-bound, operates at huge speed. So, DC shortcut +// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3). +//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. + "paddw "MANGLE(MM_2)", %%mm5 \n\t" + "movq %%mm2, %%mm6 \n\t" + + "paddw %%mm5, %%mm2 \n\t" + "psubw %%mm6, %%mm5 \n\t" + + "movq %%mm1, %%mm6 \n\t" + "paddw %%mm7, %%mm1 \n\t" //d2 + + "psubw 2*16(%%edx), %%mm1 \n\t" + "psubw %%mm7, %%mm6 \n\t" //d6 + + "movq 6*16(%%edx), %%mm7 \n\t" + "psraw $2, %%mm5 \n\t" + + "paddusw 2*16(%%edx), %%mm1 \n\t" + "psubw %%mm7, %%mm6 \n\t" + // t7 d2 /t11 t4 t6 - d6 /t10 + + "paddw 2*16(%%edx), %%mm1 \n\t" + "paddusw %%mm7, %%mm6 \n\t" + + "psubusw 2*16(%%edx), %%mm1 \n\t" + "paddw %%mm7, %%mm6 \n\t" + + "psubw "DCTSIZE_S"*4*2(%%esi), %%mm3 \n\t" + "psubusw %%mm7, %%mm6 \n\t" + + //movq [edi+"DCTSIZE_S"*2*2], mm1 + //movq [edi+"DCTSIZE_S"*6*2], mm6 + "movq %%mm1, %%mm7 \n\t" + "psraw $2, %%mm2 \n\t" + + "psubw "DCTSIZE_S"*6*2(%%esi), %%mm4 \n\t" + "psubw %%mm6, %%mm1 \n\t" + + "psubw "DCTSIZE_S"*7*2(%%esi), %%mm0 \n\t" + "paddw %%mm7, %%mm6 \n\t" //'t13 + + "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- + "movq %%mm2, %%mm7 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" + "paddw %%mm6, %%mm2 \n\t" //'t0 + + "movq %%mm2, "MANGLE(temps)"+0*8 \n\t" //! + "psubw %%mm6, %%mm7 \n\t" //'t3 + + "movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t" + "psubw %%mm6, %%mm1 \n\t" //'t12 + + "psubw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t5 + "movq %%mm5, %%mm6 \n\t" + + "movq %%mm7, "MANGLE(temps)"+3*8 \n\t" + "paddw %%mm2, %%mm3 \n\t" //t10 + + "paddw %%mm4, %%mm2 \n\t" //t11 + "paddw %%mm0, %%mm4 \n\t" //t12 + + "movq %%mm3, %%mm7 \n\t" + "psubw %%mm4, %%mm3 \n\t" + + "psllw $2, %%mm3 \n\t" + "psllw $2, %%mm7 \n\t" //opt for P6 + + "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" + "psllw $2, %%mm4 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t" + "psllw $2, %%mm2 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" + "paddw %%mm1, %%mm5 \n\t" //'t1 + + "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t" + "psubw %%mm1, %%mm6 \n\t" //'t2 + // t7 't12 't11 t4 t6 - 't13 't10 --- + + "paddw %%mm3, %%mm7 \n\t" //z2 + + "movq %%mm5, "MANGLE(temps)"+1*8 \n\t" + "paddw %%mm3, %%mm4 \n\t" //z4 + + "movq 3*16(%%edx), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + + "movq %%mm6, "MANGLE(temps)"+2*8 \n\t" + "psubw %%mm2, %%mm1 \n\t" //z13 + +//=== + "paddw %%mm2, %%mm0 \n\t" //z11 + "movq %%mm1, %%mm5 \n\t" + + "movq 5*16(%%edx), %%mm2 \n\t" + "psubw %%mm7, %%mm1 \n\t" //d3 + + "paddw %%mm7, %%mm5 \n\t" //d5 + "psubw %%mm3, %%mm1 \n\t" + + "movq 1*16(%%edx), %%mm7 \n\t" + "psubw %%mm2, %%mm5 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "paddw %%mm4, %%mm0 \n\t" //d1 + + "paddusw %%mm3, %%mm1 \n\t" + "psubw %%mm4, %%mm6 \n\t" //d7 + + // d1 d3 - - - d5 d7 - + "movq 7*16(%%edx), %%mm4 \n\t" + "psubw %%mm7, %%mm0 \n\t" + + "psubw %%mm4, %%mm6 \n\t" + "paddusw %%mm2, %%mm5 \n\t" + + "paddusw %%mm4, %%mm6 \n\t" + "paddw %%mm3, %%mm1 \n\t" + + "paddw %%mm2, %%mm5 \n\t" + "paddw %%mm4, %%mm6 \n\t" + + "psubusw %%mm3, %%mm1 \n\t" + "psubusw %%mm2, %%mm5 \n\t" + + "psubusw %%mm4, %%mm6 \n\t" + "movq %%mm1, %%mm4 \n\t" + + "por %%mm5, %%mm4 \n\t" + "paddusw %%mm7, %%mm0 \n\t" + + "por %%mm6, %%mm4 \n\t" + "paddw %%mm7, %%mm0 \n\t" + + "packssdw %%mm4, %%mm4 \n\t" + "psubusw %%mm7, %%mm0 \n\t" + + "movd %%mm4, %%eax \n\t" + "orl %%eax, %%eax \n\t" + "jnz L_NDC21f \n\t" + //movq [edi+"DCTSIZE_S"*3*2], mm1 + //movq [edi+"DCTSIZE_S"*5*2], mm5 + //movq [edi+"DCTSIZE_S"*1*2], mm0 + //movq [edi+"DCTSIZE_S"*7*2], mm6 + // t4 t5 - - - t6 t7 - + //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0 +//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile + "movq "MANGLE(temps)"+0*8, %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 + "movq %%mm1, %%mm2 \n\t" + + "movq "DCTSIZE_S"*0*2(%%edi), %%mm5 \n\t" + "movq %%mm2, %%mm3 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 + "paddw %%mm4, %%mm5 \n\t" + + "movq "MANGLE(temps)"+1*8, %%mm6 \n\t" + //paddw mm3, MM_2 + "psraw $2, %%mm3 \n\t" //tmp7 + + "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 + "psubw %%mm3, %%mm4 \n\t" + + "movq "DCTSIZE_S"*1*2(%%edi), %%mm7 \n\t" + "paddw %%mm3, %%mm5 \n\t" + + "movq %%mm4, "DCTSIZE_S"*7*2(%%edi) \n\t" + "paddw %%mm6, %%mm7 \n\t" + + "movq "MANGLE(temps)"+2*8, %%mm3 \n\t" + "psubw %%mm0, %%mm6 \n\t" + + "movq "DCTSIZE_S"*2*2(%%edi), %%mm4 \n\t" + "paddw %%mm0, %%mm7 \n\t" + + "movq %%mm5, "DCTSIZE_S"*0*2(%%edi) \n\t" + "paddw %%mm3, %%mm4 \n\t" + + "movq %%mm6, "DCTSIZE_S"*6*2(%%edi) \n\t" + "psubw %%mm1, %%mm3 \n\t" + + "movq "DCTSIZE_S"*5*2(%%edi), %%mm5 \n\t" + "paddw %%mm1, %%mm4 \n\t" + + "movq "DCTSIZE_S"*3*2(%%edi), %%mm6 \n\t" + "paddw %%mm3, %%mm5 \n\t" + + "movq "MANGLE(temps)"+3*8, %%mm0 \n\t" + "addl $8, %%esi \n\t" + + "movq %%mm7, "DCTSIZE_S"*1*2(%%edi) \n\t" + "paddw %%mm0, %%mm6 \n\t" + + "movq %%mm4, "DCTSIZE_S"*2*2(%%edi) \n\t" + "psubw %%mm2, %%mm0 \n\t" + + "movq "DCTSIZE_S"*4*2(%%edi), %%mm7 \n\t" + "paddw %%mm2, %%mm6 \n\t" + + "movq %%mm5, "DCTSIZE_S"*5*2(%%edi) \n\t" + "paddw %%mm0, %%mm7 \n\t" + + "movq %%mm6, "DCTSIZE_S"*3*2(%%edi) \n\t" + + "movq %%mm7, "DCTSIZE_S"*4*2(%%edi) \n\t" + "addl $8, %%edi \n\t" + "jmp L_END1f \n\t" + +"L_NDC21: \n\t" + //--- non DC2 + //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) + //psraw mm5, 2 + //psraw mm0, 2 + //psraw mm6, 2 + "movq %%mm5, %%mm3 \n\t" + "psubw %%mm1, %%mm5 \n\t" + + "psllw $1, %%mm5 \n\t" //'z10 + "paddw %%mm1, %%mm3 \n\t" //'z13 + + "movq %%mm0, %%mm2 \n\t" + "psubw %%mm6, %%mm0 \n\t" + + "movq %%mm5, %%mm1 \n\t" + "psllw $1, %%mm0 \n\t" //'z12 + + "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- + "paddw %%mm0, %%mm5 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 + "paddw %%mm6, %%mm2 \n\t" //'z11 + + "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" + "movq %%mm2, %%mm7 \n\t" + + //--- + "movq "MANGLE(temps)"+0*8, %%mm4 \n\t" + "psubw %%mm3, %%mm2 \n\t" + + "psllw $1, %%mm2 \n\t" + "paddw %%mm3, %%mm7 \n\t" //'t7 + + "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 + "movq %%mm4, %%mm6 \n\t" + //paddw mm7, MM_2 + "psraw $2, %%mm7 \n\t" + + "paddw "DCTSIZE_S"*0*2(%%edi), %%mm4 \n\t" + "psubw %%mm7, %%mm6 \n\t" + + "movq "MANGLE(temps)"+1*8, %%mm3 \n\t" + "paddw %%mm7, %%mm4 \n\t" + + "movq %%mm6, "DCTSIZE_S"*7*2(%%edi) \n\t" + "paddw %%mm5, %%mm1 \n\t" //'t12 + + "movq %%mm4, "DCTSIZE_S"*0*2(%%edi) \n\t" + "psubw %%mm7, %%mm1 \n\t" //'t6 + + "movq "MANGLE(temps)"+2*8, %%mm7 \n\t" + "psubw %%mm5, %%mm0 \n\t" //'t10 + + "movq "MANGLE(temps)"+3*8, %%mm6 \n\t" + "movq %%mm3, %%mm5 \n\t" + + "paddw "DCTSIZE_S"*1*2(%%edi), %%mm3 \n\t" + "psubw %%mm1, %%mm5 \n\t" + + "psubw %%mm1, %%mm2 \n\t" //'t5 + "paddw %%mm1, %%mm3 \n\t" + + "movq %%mm5, "DCTSIZE_S"*6*2(%%edi) \n\t" + "movq %%mm7, %%mm4 \n\t" + + "paddw "DCTSIZE_S"*2*2(%%edi), %%mm7 \n\t" + "psubw %%mm2, %%mm4 \n\t" + + "paddw "DCTSIZE_S"*5*2(%%edi), %%mm4 \n\t" + "paddw %%mm2, %%mm7 \n\t" + + "movq %%mm3, "DCTSIZE_S"*1*2(%%edi) \n\t" + "paddw %%mm2, %%mm0 \n\t" //'t4 + + // 't4 't6 't5 - - - - 't7 + "movq %%mm7, "DCTSIZE_S"*2*2(%%edi) \n\t" + "movq %%mm6, %%mm1 \n\t" + + "paddw "DCTSIZE_S"*4*2(%%edi), %%mm6 \n\t" + "psubw %%mm0, %%mm1 \n\t" + + "paddw "DCTSIZE_S"*3*2(%%edi), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + + "movq %%mm4, "DCTSIZE_S"*5*2(%%edi) \n\t" + "addl $8, %%esi \n\t" + + "movq %%mm6, "DCTSIZE_S"*4*2(%%edi) \n\t" + + "movq %%mm1, "DCTSIZE_S"*3*2(%%edi) \n\t" + "addl $8, %%edi \n\t" + +"L_END1: \n\t" +//=part 2 (the same)=========================================================== + "movq "DCTSIZE_S"*0*2(%%esi), %%mm1 \n\t" + // + "movq "DCTSIZE_S"*3*2(%%esi), %%mm7 \n\t" + "movq %%mm1, %%mm0 \n\t" + + "paddw "DCTSIZE_S"*7*2(%%esi), %%mm1 \n\t" //t0 + "movq %%mm7, %%mm3 \n\t" + + "paddw "DCTSIZE_S"*4*2(%%esi), %%mm7 \n\t" //t3 + "movq %%mm1, %%mm5 \n\t" + + "movq "DCTSIZE_S"*1*2(%%esi), %%mm6 \n\t" + "psubw %%mm7, %%mm1 \n\t" //t13 + + "movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t" + "movq %%mm6, %%mm4 \n\t" + + "paddw "DCTSIZE_S"*6*2(%%esi), %%mm6 \n\t" //t1 + "paddw %%mm7, %%mm5 \n\t" //t10 + + "paddw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t2 + "movq %%mm6, %%mm7 \n\t" + + "paddw %%mm2, %%mm6 \n\t" //t11 + "psubw %%mm2, %%mm7 \n\t" //t12 + + "movq %%mm5, %%mm2 \n\t" + "paddw %%mm6, %%mm5 \n\t" //d0 + // i0 t13 t12 i3 i1 d0 - d4 + "psubw %%mm6, %%mm2 \n\t" //d4 + "paddw %%mm1, %%mm7 \n\t" + + "movq 1*8+4*16(%%edx), %%mm6 \n\t" + "psllw $2, %%mm7 \n\t" + + "psubw 1*8+0*16(%%edx), %%mm5 \n\t" + "psubw %%mm6, %%mm2 \n\t" + + "paddusw 1*8+0*16(%%edx), %%mm5 \n\t" + "paddusw %%mm6, %%mm2 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t" + // + "paddw 1*8+0*16(%%edx), %%mm5 \n\t" + "paddw %%mm6, %%mm2 \n\t" + + "psubusw 1*8+0*16(%%edx), %%mm5 \n\t" + "psubusw %%mm6, %%mm2 \n\t" + +//This func is totally compute-bound, operates at huge speed. So, DC shortcut +// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3). +//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. + "paddw "MANGLE(MM_2)", %%mm5 \n\t" + "movq %%mm2, %%mm6 \n\t" + + "paddw %%mm5, %%mm2 \n\t" + "psubw %%mm6, %%mm5 \n\t" + + "movq %%mm1, %%mm6 \n\t" + "paddw %%mm7, %%mm1 \n\t" //d2 + + "psubw 1*8+2*16(%%edx), %%mm1 \n\t" + "psubw %%mm7, %%mm6 \n\t" //d6 + + "movq 1*8+6*16(%%edx), %%mm7 \n\t" + "psraw $2, %%mm5 \n\t" + + "paddusw 1*8+2*16(%%edx), %%mm1 \n\t" + "psubw %%mm7, %%mm6 \n\t" + // t7 d2 /t11 t4 t6 - d6 /t10 + + "paddw 1*8+2*16(%%edx), %%mm1 \n\t" + "paddusw %%mm7, %%mm6 \n\t" + + "psubusw 1*8+2*16(%%edx), %%mm1 \n\t" + "paddw %%mm7, %%mm6 \n\t" + + "psubw "DCTSIZE_S"*4*2(%%esi), %%mm3 \n\t" + "psubusw %%mm7, %%mm6 \n\t" + + //movq [edi+"DCTSIZE_S"*2*2], mm1 + //movq [edi+"DCTSIZE_S"*6*2], mm6 + "movq %%mm1, %%mm7 \n\t" + "psraw $2, %%mm2 \n\t" + + "psubw "DCTSIZE_S"*6*2(%%esi), %%mm4 \n\t" + "psubw %%mm6, %%mm1 \n\t" + + "psubw "DCTSIZE_S"*7*2(%%esi), %%mm0 \n\t" + "paddw %%mm7, %%mm6 \n\t" //'t13 + + "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- + "movq %%mm2, %%mm7 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" + "paddw %%mm6, %%mm2 \n\t" //'t0 + + "movq %%mm2, "MANGLE(temps)"+0*8 \n\t" //! + "psubw %%mm6, %%mm7 \n\t" //'t3 + + "movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t" + "psubw %%mm6, %%mm1 \n\t" //'t12 + + "psubw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t5 + "movq %%mm5, %%mm6 \n\t" + + "movq %%mm7, "MANGLE(temps)"+3*8 \n\t" + "paddw %%mm2, %%mm3 \n\t" //t10 + + "paddw %%mm4, %%mm2 \n\t" //t11 + "paddw %%mm0, %%mm4 \n\t" //t12 + + "movq %%mm3, %%mm7 \n\t" + "psubw %%mm4, %%mm3 \n\t" + + "psllw $2, %%mm3 \n\t" + "psllw $2, %%mm7 \n\t" //opt for P6 + + "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" + "psllw $2, %%mm4 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t" + "psllw $2, %%mm2 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" + "paddw %%mm1, %%mm5 \n\t" //'t1 + + "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t" + "psubw %%mm1, %%mm6 \n\t" //'t2 + // t7 't12 't11 t4 t6 - 't13 't10 --- + + "paddw %%mm3, %%mm7 \n\t" //z2 + + "movq %%mm5, "MANGLE(temps)"+1*8 \n\t" + "paddw %%mm3, %%mm4 \n\t" //z4 + + "movq 1*8+3*16(%%edx), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + + "movq %%mm6, "MANGLE(temps)"+2*8 \n\t" + "psubw %%mm2, %%mm1 \n\t" //z13 + +//=== + "paddw %%mm2, %%mm0 \n\t" //z11 + "movq %%mm1, %%mm5 \n\t" + + "movq 1*8+5*16(%%edx), %%mm2 \n\t" + "psubw %%mm7, %%mm1 \n\t" //d3 + + "paddw %%mm7, %%mm5 \n\t" //d5 + "psubw %%mm3, %%mm1 \n\t" + + "movq 1*8+1*16(%%edx), %%mm7 \n\t" + "psubw %%mm2, %%mm5 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "paddw %%mm4, %%mm0 \n\t" //d1 + + "paddusw %%mm3, %%mm1 \n\t" + "psubw %%mm4, %%mm6 \n\t" //d7 + + // d1 d3 - - - d5 d7 - + "movq 1*8+7*16(%%edx), %%mm4 \n\t" + "psubw %%mm7, %%mm0 \n\t" + + "psubw %%mm4, %%mm6 \n\t" + "paddusw %%mm2, %%mm5 \n\t" + + "paddusw %%mm4, %%mm6 \n\t" + "paddw %%mm3, %%mm1 \n\t" + + "paddw %%mm2, %%mm5 \n\t" + "paddw %%mm4, %%mm6 \n\t" + + "psubusw %%mm3, %%mm1 \n\t" + "psubusw %%mm2, %%mm5 \n\t" + + "psubusw %%mm4, %%mm6 \n\t" + "movq %%mm1, %%mm4 \n\t" + + "por %%mm5, %%mm4 \n\t" + "paddusw %%mm7, %%mm0 \n\t" + + "por %%mm6, %%mm4 \n\t" + "paddw %%mm7, %%mm0 \n\t" + + "packssdw %%mm4, %%mm4 \n\t" + "psubusw %%mm7, %%mm0 \n\t" + + "movd %%mm4, %%eax \n\t" + "orl %%eax, %%eax \n\t" + "jnz L_NDC22f \n\t" + //movq [edi+"DCTSIZE_S"*3*2], mm1 + //movq [edi+"DCTSIZE_S"*5*2], mm5 + //movq [edi+"DCTSIZE_S"*1*2], mm0 + //movq [edi+"DCTSIZE_S"*7*2], mm6 + // t4 t5 - - - t6 t7 - + //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0 +//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile + "movq "MANGLE(temps)"+0*8, %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 + "movq %%mm1, %%mm2 \n\t" + + "movq "DCTSIZE_S"*0*2(%%edi), %%mm5 \n\t" + "movq %%mm2, %%mm3 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 + "paddw %%mm4, %%mm5 \n\t" + + "movq "MANGLE(temps)"+1*8, %%mm6 \n\t" + //paddw mm3, MM_2 + "psraw $2, %%mm3 \n\t" //tmp7 + + "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 + "psubw %%mm3, %%mm4 \n\t" + + "movq "DCTSIZE_S"*1*2(%%edi), %%mm7 \n\t" + "paddw %%mm3, %%mm5 \n\t" + + "movq %%mm4, "DCTSIZE_S"*7*2(%%edi) \n\t" + "paddw %%mm6, %%mm7 \n\t" + + "movq "MANGLE(temps)"+2*8, %%mm3 \n\t" + "psubw %%mm0, %%mm6 \n\t" + + "movq "DCTSIZE_S"*2*2(%%edi), %%mm4 \n\t" + "paddw %%mm0, %%mm7 \n\t" + + "movq %%mm5, "DCTSIZE_S"*0*2(%%edi) \n\t" + "paddw %%mm3, %%mm4 \n\t" + + "movq %%mm6, "DCTSIZE_S"*6*2(%%edi) \n\t" + "psubw %%mm1, %%mm3 \n\t" + + "movq "DCTSIZE_S"*5*2(%%edi), %%mm5 \n\t" + "paddw %%mm1, %%mm4 \n\t" + + "movq "DCTSIZE_S"*3*2(%%edi), %%mm6 \n\t" + "paddw %%mm3, %%mm5 \n\t" + + "movq "MANGLE(temps)"+3*8, %%mm0 \n\t" + "addl $24, %%esi \n\t" + + "movq %%mm7, "DCTSIZE_S"*1*2(%%edi) \n\t" + "paddw %%mm0, %%mm6 \n\t" + + "movq %%mm4, "DCTSIZE_S"*2*2(%%edi) \n\t" + "psubw %%mm2, %%mm0 \n\t" + + "movq "DCTSIZE_S"*4*2(%%edi), %%mm7 \n\t" + "paddw %%mm2, %%mm6 \n\t" + + "movq %%mm5, "DCTSIZE_S"*5*2(%%edi) \n\t" + "paddw %%mm0, %%mm7 \n\t" + + "movq %%mm6, "DCTSIZE_S"*3*2(%%edi) \n\t" + + "movq %%mm7, "DCTSIZE_S"*4*2(%%edi) \n\t" + "addl $24, %%edi \n\t" + "subl $2, %%ecx \n\t" + "jnz L_COLUMSb \n\t" + "jmp L_ENDf \n\t" + +"L_NDC22: \n\t" + //--- non DC2 + //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) + //psraw mm5, 2 + //psraw mm0, 2 + //psraw mm6, 2 + "movq %%mm5, %%mm3 \n\t" + "psubw %%mm1, %%mm5 \n\t" + + "psllw $1, %%mm5 \n\t" //'z10 + "paddw %%mm1, %%mm3 \n\t" //'z13 + + "movq %%mm0, %%mm2 \n\t" + "psubw %%mm6, %%mm0 \n\t" + + "movq %%mm5, %%mm1 \n\t" + "psllw $1, %%mm0 \n\t" //'z12 + + "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- + "paddw %%mm0, %%mm5 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 + "paddw %%mm6, %%mm2 \n\t" //'z11 + + "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" + "movq %%mm2, %%mm7 \n\t" + + //--- + "movq "MANGLE(temps)"+0*8, %%mm4 \n\t" + "psubw %%mm3, %%mm2 \n\t" + + "psllw $1, %%mm2 \n\t" + "paddw %%mm3, %%mm7 \n\t" //'t7 + + "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 + "movq %%mm4, %%mm6 \n\t" + //paddw mm7, MM_2 + "psraw $2, %%mm7 \n\t" + + "paddw "DCTSIZE_S"*0*2(%%edi), %%mm4 \n\t" + "psubw %%mm7, %%mm6 \n\t" + + "movq "MANGLE(temps)"+1*8, %%mm3 \n\t" + "paddw %%mm7, %%mm4 \n\t" + + "movq %%mm6, "DCTSIZE_S"*7*2(%%edi) \n\t" + "paddw %%mm5, %%mm1 \n\t" //'t12 + + "movq %%mm4, "DCTSIZE_S"*0*2(%%edi) \n\t" + "psubw %%mm7, %%mm1 \n\t" //'t6 + + "movq "MANGLE(temps)"+2*8, %%mm7 \n\t" + "psubw %%mm5, %%mm0 \n\t" //'t10 + + "movq "MANGLE(temps)"+3*8, %%mm6 \n\t" + "movq %%mm3, %%mm5 \n\t" + + "paddw "DCTSIZE_S"*1*2(%%edi), %%mm3 \n\t" + "psubw %%mm1, %%mm5 \n\t" + + "psubw %%mm1, %%mm2 \n\t" //'t5 + "paddw %%mm1, %%mm3 \n\t" + + "movq %%mm5, "DCTSIZE_S"*6*2(%%edi) \n\t" + "movq %%mm7, %%mm4 \n\t" + + "paddw "DCTSIZE_S"*2*2(%%edi), %%mm7 \n\t" + "psubw %%mm2, %%mm4 \n\t" + + "paddw "DCTSIZE_S"*5*2(%%edi), %%mm4 \n\t" + "paddw %%mm2, %%mm7 \n\t" + + "movq %%mm3, "DCTSIZE_S"*1*2(%%edi) \n\t" + "paddw %%mm2, %%mm0 \n\t" //'t4 + + // 't4 't6 't5 - - - - 't7 + "movq %%mm7, "DCTSIZE_S"*2*2(%%edi) \n\t" + "movq %%mm6, %%mm1 \n\t" + + "paddw "DCTSIZE_S"*4*2(%%edi), %%mm6 \n\t" + "psubw %%mm0, %%mm1 \n\t" + + "paddw "DCTSIZE_S"*3*2(%%edi), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + + "movq %%mm4, "DCTSIZE_S"*5*2(%%edi) \n\t" + "addl $24, %%esi \n\t" + + "movq %%mm6, "DCTSIZE_S"*4*2(%%edi) \n\t" + + "movq %%mm1, "DCTSIZE_S"*3*2(%%edi) \n\t" + "addl $24, %%edi \n\t" + "subl $2, %%ecx \n\t" + "jnz L_COLUMSb \n\t" +"L_END: \n\t" + + : + : "S"(data), "D"(output), "c"(cnt), "d"(thr_adr)// input regs + : "%eax", "%ecx", "%edx", "%edi", "%esi" + ); +} + + +static void row_idct_c(DCTELEM* workspace, + int16_t* output_adr, int output_stride, int cnt) +{ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int_simd16_t tmp10, tmp11, tmp12, tmp13; + int_simd16_t z5, z10, z11, z12, z13; + int16_t* outptr; + DCTELEM* wsptr; + + cnt*=4; + wsptr = workspace; + outptr = output_adr; + for (; cnt > 0; cnt--) { + // Even part + //Simd version reads 4x4 block and transposes it + tmp10 = ( wsptr[2] + wsptr[3]); + tmp11 = ( wsptr[2] - wsptr[3]); + + tmp13 = ( wsptr[0] + wsptr[1]); + tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow + + tmp0 = tmp10 + tmp13; //->temps + tmp3 = tmp10 - tmp13; //->temps + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + //Also transpose, with previous: + // ---- ---- |||| + // ---- ---- idct |||| + // ---- ---- ---> |||| + // ---- ---- |||| + z13 = wsptr[4] + wsptr[5]; + z10 = wsptr[4] - wsptr[5]; + z11 = wsptr[6] + wsptr[7]; + z12 = wsptr[6] - wsptr[7]; + + tmp7 = z11 + z13; + tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562); + + z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY16H(z10, - FIX_2_613125930) + z5; + + tmp6 = (tmp12<<3) - tmp7; + tmp5 = (tmp11<<3) - tmp6; + tmp4 = (tmp10<<3) + tmp5; + + // Final output stage: descale and write column + outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3); + outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3); + outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3); + outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3); + outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3); + outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3); + outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ? + outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ? + outptr++; + + wsptr += DCTSIZE; // advance pointer to next row + } +} + +static void row_idct_mmx (DCTELEM* workspace, + int16_t* output_adr, int output_stride, int cnt) +{//uint64_t temps[2]; + asm volatile( + "leal (%%eax,%%eax,2), %%edx \n\t" +"L_ROWSi: \n\t" + "movq "DCTSIZE_S"*0*2(%%esi), %%mm0 \n\t" + // + + "movq "DCTSIZE_S"*1*2(%%esi), %%mm1 \n\t" + "movq %%mm0, %%mm4 \n\t" + + "movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + + "movq "DCTSIZE_S"*3*2(%%esi), %%mm3 \n\t" + "punpckhwd %%mm1, %%mm4 \n\t" + + //transpose 4x4 + "movq %%mm2, %%mm7 \n\t" + "punpcklwd %%mm3, %%mm2 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "punpckldq %%mm2, %%mm0 \n\t" //0 + + "punpckhdq %%mm2, %%mm6 \n\t" //1 + "movq %%mm0, %%mm5 \n\t" + + "punpckhwd %%mm3, %%mm7 \n\t" + "psubw %%mm6, %%mm0 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t" + "movq %%mm4, %%mm2 \n\t" + + "punpckldq %%mm7, %%mm4 \n\t" //2 + "paddw %%mm6, %%mm5 \n\t" + + "punpckhdq %%mm7, %%mm2 \n\t" //3 + "movq %%mm4, %%mm1 \n\t" + + "psllw $2, %%mm0 \n\t" + "paddw %%mm2, %%mm4 \n\t" //t10 + + "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%esi), %%mm3 \n\t" + "psubw %%mm2, %%mm1 \n\t" //t11 + + "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%esi), %%mm2 \n\t" + "psubw %%mm5, %%mm0 \n\t" + + "movq %%mm4, %%mm6 \n\t" + "paddw %%mm5, %%mm4 \n\t" //t0 + + "psubw %%mm5, %%mm6 \n\t" //t3 + "movq %%mm1, %%mm7 \n\t" + + "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%esi), %%mm5 \n\t" + "paddw %%mm0, %%mm1 \n\t" //t1 + + "movq %%mm4, "MANGLE(temps)"+0*8 \n\t" //t0 + "movq %%mm3, %%mm4 \n\t" + + "movq %%mm6, "MANGLE(temps)"+1*8 \n\t" //t3 + "punpcklwd %%mm2, %%mm3 \n\t" + + //transpose 4x4 + "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%esi), %%mm6 \n\t" + "punpckhwd %%mm2, %%mm4 \n\t" + + "movq %%mm5, %%mm2 \n\t" + "punpcklwd %%mm6, %%mm5 \n\t" + + "psubw %%mm0, %%mm7 \n\t" //t2 + "punpckhwd %%mm6, %%mm2 \n\t" + + "movq %%mm3, %%mm0 \n\t" + "punpckldq %%mm5, %%mm3 \n\t" //4 + + "punpckhdq %%mm5, %%mm0 \n\t" //5 + "movq %%mm4, %%mm5 \n\t" + + // + "movq %%mm3, %%mm6 \n\t" + "punpckldq %%mm2, %%mm4 \n\t" //6 + + "psubw %%mm0, %%mm3 \n\t" //z10 + "punpckhdq %%mm2, %%mm5 \n\t" //7 + + "paddw %%mm0, %%mm6 \n\t" //z13 + "movq %%mm4, %%mm2 \n\t" + + "movq %%mm3, %%mm0 \n\t" + "psubw %%mm5, %%mm4 \n\t" //z12 + + "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //- + "paddw %%mm4, %%mm3 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5 + "paddw %%mm5, %%mm2 \n\t" //z11 > + + "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + + "psubw %%mm6, %%mm2 \n\t" + "paddw %%mm6, %%mm5 \n\t" //t7 + + "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11 + "paddw %%mm3, %%mm0 \n\t" //t12 + + "psllw $3, %%mm0 \n\t" + "psubw %%mm3, %%mm4 \n\t" //t10 + + "movq "MANGLE(temps)"+0*8, %%mm6 \n\t" + "movq %%mm1, %%mm3 \n\t" + + "psllw $3, %%mm4 \n\t" + "psubw %%mm5, %%mm0 \n\t" //t6 + + "psllw $3, %%mm2 \n\t" + "paddw %%mm0, %%mm1 \n\t" //d1 + + "psubw %%mm0, %%mm2 \n\t" //t5 + "psubw %%mm0, %%mm3 \n\t" //d6 + + "paddw %%mm2, %%mm4 \n\t" //t4 + "movq %%mm7, %%mm0 \n\t" + + "paddw %%mm2, %%mm7 \n\t" //d2 + "psubw %%mm2, %%mm0 \n\t" //d5 + + "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4 + "psubw %%mm5, %%mm6 \n\t" //d7 + + "paddw "MANGLE(temps)"+0*8, %%mm5 \n\t" //d0 + "paddw %%mm2, %%mm1 \n\t" + + "paddw %%mm2, %%mm5 \n\t" + "psraw $3, %%mm1 \n\t" + + "paddw %%mm2, %%mm7 \n\t" + "psraw $3, %%mm5 \n\t" + + "paddw (%%edi), %%mm5 \n\t" + "psraw $3, %%mm7 \n\t" + + "paddw (%%edi,%%eax,), %%mm1 \n\t" + "paddw %%mm2, %%mm0 \n\t" + + "paddw (%%edi,%%eax,2), %%mm7 \n\t" + "paddw %%mm2, %%mm3 \n\t" + + "movq %%mm5, (%%edi) \n\t" + "paddw %%mm2, %%mm6 \n\t" + + "movq %%mm1, (%%edi,%%eax,) \n\t" + "psraw $3, %%mm0 \n\t" + + "movq %%mm7, (%%edi,%%eax,2) \n\t" + "addl %%edx, %%edi \n\t" //3*ls + + "movq "MANGLE(temps)"+1*8, %%mm5 \n\t" //t3 + "psraw $3, %%mm3 \n\t" + + "paddw (%%edi,%%eax,2), %%mm0 \n\t" + "psubw %%mm4, %%mm5 \n\t" //d3 + + "paddw (%%edi,%%edx,), %%mm3 \n\t" + "psraw $3, %%mm6 \n\t" + + "paddw "MANGLE(temps)"+1*8, %%mm4 \n\t" //d4 + "paddw %%mm2, %%mm5 \n\t" + + "paddw (%%edi,%%eax,4), %%mm6 \n\t" + "paddw %%mm2, %%mm4 \n\t" + + "movq %%mm0, (%%edi,%%eax,2) \n\t" + "psraw $3, %%mm5 \n\t" + + "paddw (%%edi), %%mm5 \n\t" + "psraw $3, %%mm4 \n\t" + + "paddw (%%edi,%%eax,), %%mm4 \n\t" + "addl "DCTSIZE_S"*2*4, %%esi \n\t" //4 rows + + "movq %%mm3, (%%edi,%%edx,) \n\t" + "movq %%mm6, (%%edi,%%eax,4) \n\t" + "movq %%mm5, (%%edi) \n\t" + "movq %%mm4, (%%edi,%%eax,) \n\t" + + "subl %%edx, %%edi \n\t" + "addl $8, %%edi \n\t" + "decl %%ecx \n\t" + "jnz L_ROWSib \n\t" + + : + : "S"(workspace), "D"(output_adr), "a"(output_stride*sizeof(short)), "c"(cnt) //input regs + : "%eax", "%ecx", "%edx", "%edi", "%esi" + ); +} + + +static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt) +{ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int_simd16_t tmp10, tmp11, tmp12, tmp13; + int_simd16_t z1, z2, z3, z4, z5, z11, z13; + DCTELEM *dataptr; + + cnt*=4; + // Pass 1: process rows. + + dataptr = data; + for (; cnt > 0; cnt--) { + tmp0 = pixels[line_size*0] + pixels[line_size*7]; + tmp7 = pixels[line_size*0] - pixels[line_size*7]; + tmp1 = pixels[line_size*1] + pixels[line_size*6]; + tmp6 = pixels[line_size*1] - pixels[line_size*6]; + tmp2 = pixels[line_size*2] + pixels[line_size*5]; + tmp5 = pixels[line_size*2] - pixels[line_size*5]; + tmp3 = pixels[line_size*3] + pixels[line_size*4]; + tmp4 = pixels[line_size*3] - pixels[line_size*4]; + + // Even part + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + //Even columns are written first, this leads to different order of columns + //in column_fidct(), but they are processed independently, so all ok. + //Later in the row_idct() columns readed at the same order. + dataptr[2] = tmp10 + tmp11; + dataptr[3] = tmp10 - tmp11; + + z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781); + dataptr[0] = tmp13 + z1; + dataptr[1] = tmp13 - z1; + + // Odd part + + tmp10 = (tmp4 + tmp5) <<2; + tmp11 = (tmp5 + tmp6) <<2; + tmp12 = (tmp6 + tmp7) <<2; + + z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433); + z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5; + z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5; + z3 = MULTIPLY16H(tmp11, FIX_0_707106781); + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + dataptr[4] = z13 + z2; + dataptr[5] = z13 - z2; + dataptr[6] = z11 + z4; + dataptr[7] = z11 - z4; + + pixels++; // advance pointer to next column + dataptr += DCTSIZE; + } +} + + +static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt) +{//uint64_t temps[2]; + asm volatile( + "leal (%%eax,%%eax,2), %%edx \n\t" +"L_ROWS: \n\t" + "movd (%%esi), %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + + "movd (%%esi,%%eax,), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + + "movd (%%esi,%%eax,2), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + + "punpcklbw %%mm7, %%mm2 \n\t" + "addl %%edx, %%esi \n\t" + + "movq %%mm0, %%mm5 \n\t" + // + + "movd (%%esi,%%eax,4), %%mm3 \n\t" //7 ;prefetch! + "movq %%mm1, %%mm6 \n\t" + + "movd (%%esi,%%edx,), %%mm4 \n\t" //6 + "punpcklbw %%mm7, %%mm3 \n\t" + + "psubw %%mm3, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + + "paddw %%mm3, %%mm0 \n\t" + "psubw %%mm4, %%mm6 \n\t" + + "movd (%%esi,%%eax,2), %%mm3 \n\t" //5 + "paddw %%mm4, %%mm1 \n\t" + + "movq %%mm5, "MANGLE(temps)"+0*8 \n\t" //t7 + "punpcklbw %%mm7, %%mm3 \n\t" + + "movq %%mm6, "MANGLE(temps)"+1*8 \n\t" //t6 + "movq %%mm2, %%mm4 \n\t" + + "movd (%%esi), %%mm5 \n\t" //3 + "paddw %%mm3, %%mm2 \n\t" + + "movd (%%esi,%%eax,), %%mm6 \n\t" //4 + "punpcklbw %%mm7, %%mm5 \n\t" + + "psubw %%mm3, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + + "movq %%mm5, %%mm3 \n\t" + "paddw %%mm6, %%mm5 \n\t" //t3 + + "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - - + "movq %%mm0, %%mm6 \n\t" + + "movq %%mm1, %%mm7 \n\t" + "psubw %%mm5, %%mm0 \n\t" //t13 + + "psubw %%mm2, %%mm1 \n\t" + "paddw %%mm2, %%mm7 \n\t" //t11 + + "paddw %%mm0, %%mm1 \n\t" + "movq %%mm7, %%mm2 \n\t" + + "psllw $2, %%mm1 \n\t" + "paddw %%mm5, %%mm6 \n\t" //t10 + + "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t" + "paddw %%mm6, %%mm7 \n\t" //d2 + + "psubw %%mm2, %%mm6 \n\t" //d3 + "movq %%mm0, %%mm5 \n\t" + + //transpose 4x4 + "movq %%mm7, %%mm2 \n\t" + "punpcklwd %%mm6, %%mm7 \n\t" + + "paddw %%mm1, %%mm0 \n\t" //d0 + "punpckhwd %%mm6, %%mm2 \n\t" + + "psubw %%mm1, %%mm5 \n\t" //d1 + "movq %%mm0, %%mm6 \n\t" + + "movq "MANGLE(temps)"+1*8, %%mm1 \n\t" + "punpcklwd %%mm5, %%mm0 \n\t" + + "punpckhwd %%mm5, %%mm6 \n\t" + "movq %%mm0, %%mm5 \n\t" + + "punpckldq %%mm7, %%mm0 \n\t" //0 + "paddw %%mm4, %%mm3 \n\t" + + "punpckhdq %%mm7, %%mm5 \n\t" //1 + "movq %%mm6, %%mm7 \n\t" + + "movq %%mm0, "DCTSIZE_S"*0*2(%%edi) \n\t" + "punpckldq %%mm2, %%mm6 \n\t" //2 + + "movq %%mm5, "DCTSIZE_S"*1*2(%%edi) \n\t" + "punpckhdq %%mm2, %%mm7 \n\t" //3 + + "movq %%mm6, "DCTSIZE_S"*2*2(%%edi) \n\t" + "paddw %%mm1, %%mm4 \n\t" + + "movq %%mm7, "DCTSIZE_S"*3*2(%%edi) \n\t" + "psllw $2, %%mm3 \n\t" //t10 + + "movq "MANGLE(temps)"+0*8, %%mm2 \n\t" + "psllw $2, %%mm4 \n\t" //t11 + + "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t" //z3 + "paddw %%mm2, %%mm1 \n\t" + + "psllw $2, %%mm1 \n\t" //t12 + "movq %%mm3, %%mm0 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t" + "psubw %%mm1, %%mm3 \n\t" + + "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5 + "movq %%mm2, %%mm5 \n\t" + + "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t" + "psubw %%mm4, %%mm2 \n\t" //z13 + + "paddw %%mm4, %%mm5 \n\t" //z11 + "movq %%mm2, %%mm6 \n\t" + + "paddw %%mm3, %%mm0 \n\t" //z2 + "movq %%mm5, %%mm7 \n\t" + + "paddw %%mm0, %%mm2 \n\t" //d4 + "psubw %%mm0, %%mm6 \n\t" //d5 + + "movq %%mm2, %%mm4 \n\t" + "paddw %%mm3, %%mm1 \n\t" //z4 + + //transpose 4x4 + "punpcklwd %%mm6, %%mm2 \n\t" + "paddw %%mm1, %%mm5 \n\t" //d6 + + "punpckhwd %%mm6, %%mm4 \n\t" + "psubw %%mm1, %%mm7 \n\t" //d7 + + "movq %%mm5, %%mm6 \n\t" + "punpcklwd %%mm7, %%mm5 \n\t" + + "punpckhwd %%mm7, %%mm6 \n\t" + "movq %%mm2, %%mm7 \n\t" + + "punpckldq %%mm5, %%mm2 \n\t" //4 + "subl %%edx, %%esi \n\t" + + "punpckhdq %%mm5, %%mm7 \n\t" //5 + "movq %%mm4, %%mm5 \n\t" + + "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%edi) \n\t" + "punpckldq %%mm6, %%mm4 \n\t" //6 + + "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%edi) \n\t" + "punpckhdq %%mm6, %%mm5 \n\t" //7 + + "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%edi) \n\t" + "addl $4, %%esi \n\t" + + "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%edi) \n\t" + "addl "DCTSIZE_S"*2*4, %%edi \n\t" //4 rows + "decl %%ecx \n\t" + "jnz L_ROWSb \n\t" + + : + : "S"(pixels), "D"(data), "a"(line_size), "c"(cnt) //input regs + : "%eax", "%ecx", "%edx", "%edi", "%esi"); +} #endif //USE_LIBAVCODEC