--- vf_spp.c.nikola 2005-06-02 20:52:05.000000000 +0200 +++ vf_spp.c 2005-06-03 08:55:22.974137452 +0200 @@ -171,7 +171,7 @@ #define row_idct_s row_idct_c #define row_fdct_s row_fdct_c -#else //MMX functions +#else /* HAVE_MMX */ //This func reads from 1 slice, 1 and clears 0 & 1 static void store_slice_mmx(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale) @@ -229,9 +229,9 @@ "cmpl %4, %%edx \n\t" "jl 2b \n\t" + : "+m" (width), "+m" (src_stride), "+g" (od), "+m" (dst_stride), "+g" (height), + "+d" (log2_scale), "+S" (src), "+D" (dst) //input : - : "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (height), - "d" (log2_scale), "S" (src), "D" (dst) //input : "%ecx", "%eax" ); } @@ -292,9 +292,9 @@ "cmpl %4, %%edx \n\t" "jl 2b \n\t" + : "+m" (width), "+m" (src_stride), "+g" (od), "+m" (dst_stride), "+g" (height), + "+d" (log2_scale), "+S" (src), "+D" (dst) //input : - : "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (height), - "d" (log2_scale), "S" (src), "D" (dst) //input : "%ecx", "%eax" ); } @@ -371,9 +371,8 @@ "movq %%mm0, 14*8+0*8(%%edi) \n\t" "movq %%mm1, 14*8+1*8(%%edi) \n\t" + : "+g" (q), "+S" (adr), "+D" (adr) : - : "g" (q), "S" (adr), "D" (adr) - //: ); } @@ -388,7 +387,7 @@ #define column_fidct_s column_fidct_mmx #define row_idct_s row_idct_mmx #define row_fdct_s row_fdct_mmx -#endif +#endif // HAVE_MMX static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma){ @@ -401,6 +400,8 @@ DCTELEM *block= (DCTELEM *)block_align; DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ); + memset(block3, 0, 4*8*BLOCKSZ); + //p->src=src-src_stride*8-8;//! if (!src || !dst) return; // HACK avoid crash for Y8 colourspace for(y=0; ytemp+ 8 +8*stride, + if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride, dst_stride, stride, width, y&7, 5-p->log2_count); - else store_slice2_s(dst + (y-8&~7)*dst_stride, p->temp+ 8 +0*stride, + else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride, dst_stride, stride, width, y&7, 5-p->log2_count); } } @@ -472,7 +473,7 @@ int h= (height+16+15)&(~15); vf->priv->temp_stride= (width+16+15)&(~15); - vf->priv->temp= (int16_t*)av_malloc(vf->priv->temp_stride*3*8*sizeof(int16_t)); + vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t)); //this can also be avoided, see above vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t)); @@ -564,6 +565,7 @@ return 0; } +/* static unsigned int fmt_list[]={ IMGFMT_YVU9, IMGFMT_IF09, @@ -578,6 +580,7 @@ IMGFMT_411P, 0 }; +*/ static int control(struct vf_instance_s* vf, int request, void* data){ switch(request){ @@ -667,40 +670,47 @@ #define MULTIPLY16H(x,k) (((x)*(k))>>16) #define THRESHOLD(r,x,t) if(((unsigned)((x)+t))>t*2) r=(x);else r=0; -#define DESCALE(x,n) ((x) + (1 << ((n)-1)) >> n) - -static uint64_t temps[4];//!! +#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> n) -static const uint64_t MM_FIX_0_382683433=FIX64(0.382683433, 14); -static const uint64_t MM_FIX_0_541196100=FIX64(0.541196100, 14); -static const uint64_t MM_FIX_0_707106781=FIX64(0.707106781, 14); -static const uint64_t MM_FIX_1_306562965=FIX64(1.306562965, 14); +#ifdef HAVE_MMX -static const uint64_t MM_FIX_1_414213562_A=FIX64(1.414213562, 14); +static uint64_t attribute_used __attribute__((aligned(8))) temps[4];//!! -static const uint64_t MM_FIX_1_847759065=FIX64(1.847759065, 13); -static const uint64_t MM_FIX_2_613125930=FIX64(-2.613125930, 13); //- -static const uint64_t MM_FIX_1_414213562=FIX64(1.414213562, 13); -static const uint64_t MM_FIX_1_082392200=FIX64(1.082392200, 13); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_382683433=FIX64(0.382683433, 14); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_541196100=FIX64(0.541196100, 14); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_707106781=FIX64(0.707106781, 14); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_1_306562965=FIX64(1.306562965, 14); + +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_1_414213562_A=FIX64(1.414213562, 14); + +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_1_847759065=FIX64(1.847759065, 13); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_2_613125930=FIX64(-2.613125930, 13); //- +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_1_414213562=FIX64(1.414213562, 13); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_1_082392200=FIX64(1.082392200, 13); //for t3,t5,t7 == 0 shortcut -static const uint64_t MM_FIX_0_847759065=FIX64(0.847759065, 14); -static const uint64_t MM_FIX_0_566454497=FIX64(0.566454497, 14); -static const uint64_t MM_FIX_0_198912367=FIX64(0.198912367, 14); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_847759065=FIX64(0.847759065, 14); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_566454497=FIX64(0.566454497, 14); +static uint64_t attribute_used __attribute__((aligned(8))) MM_FIX_0_198912367=FIX64(0.198912367, 14); -static const uint64_t MM_DESCALE_RND=C64(4); -static const uint64_t MM_2=C64(2); +static uint64_t attribute_used __attribute__((aligned(8))) MM_DESCALE_RND=C64(4); +static uint64_t attribute_used __attribute__((aligned(8))) MM_2=C64(2); +#else /* !HAVE_MMX */ typedef int32_t int_simd16_t; -static const int16_t FIX_0_382683433=FIX(0.382683433, 14); -static const int16_t FIX_0_541196100=FIX(0.541196100, 14); -static const int16_t FIX_0_707106781=FIX(0.707106781, 14); -static const int16_t FIX_1_306562965=FIX(1.306562965, 14); -static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14); -static const int16_t FIX_1_847759065=FIX(1.847759065, 13); -static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //- -static const int16_t FIX_1_414213562=FIX(1.414213562, 13); -static const int16_t FIX_1_082392200=FIX(1.082392200, 13); +static int16_t FIX_0_382683433=FIX(0.382683433, 14); +static int16_t FIX_0_541196100=FIX(0.541196100, 14); +static int16_t FIX_0_707106781=FIX(0.707106781, 14); +static int16_t FIX_1_306562965=FIX(1.306562965, 14); +static int16_t FIX_1_414213562_A=FIX(1.414213562, 14); +static int16_t FIX_1_847759065=FIX(1.847759065, 13); +static int16_t FIX_2_613125930=FIX(-2.613125930, 13); //- +static int16_t FIX_1_414213562=FIX(1.414213562, 13); +static int16_t FIX_1_082392200=FIX(1.082392200, 13); + +#endif + +#ifndef HAVE_MMX static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt) {int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -824,11 +834,13 @@ } } +#else /* HAVE_MMX */ + static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt) {//uint64_t temps[4]; asm volatile( ".align 16 \n\t" -"L_COLUMS: \n\t" +"1: \n\t" "movq "DCTSIZE_S"*0*2(%%esi), %%mm1 \n\t" // "movq "DCTSIZE_S"*3*2(%%esi), %%mm7 \n\t" @@ -1020,7 +1032,7 @@ "movd %%mm4, %%eax \n\t" "orl %%eax, %%eax \n\t" - "jnz L_NDC21 \n\t" + "jnz 2f \n\t" //movq [edi+"DCTSIZE_S"*3*2], mm1 //movq [edi+"DCTSIZE_S"*5*2], mm5 //movq [edi+"DCTSIZE_S"*1*2], mm0 @@ -1090,9 +1102,9 @@ "movq %%mm7, "DCTSIZE_S"*4*2(%%edi) \n\t" "addl $8, %%edi \n\t" - "jmp L_END1 \n\t" + "jmp 4f \n\t" -"L_NDC21: \n\t" +"2: \n\t" //--- non DC2 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) //psraw mm5, 2 @@ -1185,7 +1197,7 @@ "movq %%mm1, "DCTSIZE_S"*3*2(%%edi) \n\t" "addl $8, %%edi \n\t" -"L_END1: \n\t" +"4: \n\t" //=part 2 (the same)=========================================================== "movq "DCTSIZE_S"*0*2(%%esi), %%mm1 \n\t" // @@ -1378,7 +1390,7 @@ "movd %%mm4, %%eax \n\t" "orl %%eax, %%eax \n\t" - "jnz L_NDC22 \n\t" + "jnz 3f \n\t" //movq [edi+"DCTSIZE_S"*3*2], mm1 //movq [edi+"DCTSIZE_S"*5*2], mm5 //movq [edi+"DCTSIZE_S"*1*2], mm0 @@ -1449,10 +1461,10 @@ "movq %%mm7, "DCTSIZE_S"*4*2(%%edi) \n\t" "addl $24, %%edi \n\t" "subl $2, %%ecx \n\t" - "jnz L_COLUMS \n\t" - "jmp L_END \n\t" + "jnz 1b \n\t" + "jmp 5f \n\t" -"L_NDC22: \n\t" +"3: \n\t" //--- non DC2 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) //psraw mm5, 2 @@ -1545,15 +1557,18 @@ "movq %%mm1, "DCTSIZE_S"*3*2(%%edi) \n\t" "addl $24, %%edi \n\t" "subl $2, %%ecx \n\t" - "jnz L_COLUMS \n\t" -"L_END: \n\t" + "jnz 1b \n\t" +"5: \n\t" + : "+S"(data), "+D"(output), "+c"(cnt), "+d"(thr_adr)// input regs : - : "S"(data), "D"(output), "c"(cnt), "d"(thr_adr)// input regs : "%eax" ); } +#endif // HAVE_MMX + +#ifndef HAVE_MMX static void row_idct_c(DCTELEM* workspace, int16_t* output_adr, int output_stride, int cnt) @@ -1617,12 +1632,14 @@ } } +#else /* HAVE_MMX */ + static void row_idct_mmx (DCTELEM* workspace, int16_t* output_adr, int output_stride, int cnt) {//uint64_t temps[2]; asm volatile( "leal (%%eax,%%eax,2), %%edx \n\t" -"L_ROWSi: \n\t" +"1: \n\t" "movq "DCTSIZE_S"*0*2(%%esi), %%mm0 \n\t" // @@ -1808,14 +1825,17 @@ "subl %%edx, %%edi \n\t" "addl $8, %%edi \n\t" "decl %%ecx \n\t" - "jnz L_ROWSi \n\t" + "jnz 1b \n\t" - : - : "S"(workspace), "D"(output_adr), "a"(output_stride*sizeof(short)), "c"(cnt) //input regs + : "+S"(workspace), "+D"(output_adr), "+c"(cnt) //input regs + : "a"(output_stride*sizeof(short)) : "%edx" ); } +#endif // HAVE_MMX + +#ifndef HAVE_MMX static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -1877,12 +1897,13 @@ } } +#else /* HAVE_MMX */ static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt) -{//uint64_t temps[2]; +{ asm volatile( "leal (%%eax,%%eax,2), %%edx \n\t" -"1: \n\t" +"6: \n\t" "movd (%%esi), %%mm0 \n\t" "pxor %%mm7, %%mm7 \n\t" @@ -2047,11 +2068,13 @@ "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%edi) \n\t" "addl $"DCTSIZE_S"*2*4, %%edi \n\t" //4 rows "decl %%ecx \n\t" - "jnz 1b \n\t" + "jnz 6b \n\t" - : - : "S"(pixels), "D"(data), "a"(line_size), "c"(cnt) //input regs - : "%edx" ); + : "+S"(pixels), "+D"(data), "+a"(line_size), "+c"(cnt) //input regs + : + : "%edx"); } -#endif //USE_LIBAVCODEC \ No newline at end of file +#endif // HAVE_MMX + +#endif //USE_LIBAVCODEC