[Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Robert Edele
yartrebo
Tue Mar 7 23:12:38 CET 2006
On Mon, 2006-03-06 at 02:06 +0100, Michael Niedermayer wrote:
> Hi
>
> On Sun, Mar 05, 2006 at 06:09:09PM -0500, Robert Edele wrote:
> [...]
> > With the help of ods15, we have done the following:
> > - the asm code now resides entirely in dsputil_mmx.c.
> > - snow_mmx_sse2.h is now gone
> > - code previously in snow.c and all of snow_mmx_sse2.h is now in
> > dsputil_mmx.c, dsputil.c, and dsputil.h.
> > - snow calls the asm via dsputil function pointers.
> >
> > If you have any further issues with this code, please let me know.
>
> it looks much better than before, but
> please move the stuff from dsputil_mmx.c to snowdsp_mmx.c
> this should be just a copy&paste + Makefile update
>
Fixed as suggested.
>
> [...]
> > -static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
> > +void ff_snow_vertical_compose97i(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> > + DWTELEM *b0 = vb0;
> > + DWTELEM *b1 = vb1;
> > + DWTELEM *b2 = vb2;
> > + DWTELEM *b3 = vb3;
> > + DWTELEM *b4 = vb4;
> > + DWTELEM *b5 = vb5;
>
> move DWTELEM to dsputil.h or anything else but please not that mess
>
moved to snow.h
>
> [...]
> > @@ -2545,6 +2620,41 @@
> > }
> > }
> >
> > +void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> > + int src_x, int src_y, int src_stride, void * vsb, int add, uint8_t * dst8){
> > + slice_buffer * sb = vsb;
>
> uhm...
>
> put
> typedef struct slice_buffer_s slice_buffer;
> in dsputil.c or wherever its needed, and
> struct slice_buffer_s { ... }; in snow.c
>
>
slice_buffer struct definition put into snow.h.
>
>
> [...]
> > Index: i386/dsputil_mmx.c
> > ===================================================================
> > RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
> > retrieving revision 1.111
> > diff -u -r1.111 dsputil_mmx.c
> > --- i386/dsputil_mmx.c 10 Feb 2006 06:55:25 -0000 1.111
> > +++ i386/dsputil_mmx.c 5 Mar 2006 17:31:12 -0000
> > @@ -2564,6 +2564,1518 @@
> > }
> > #endif
> >
> > +/* snow wavelet */
> > +#define DWTELEM int
> > +#define W_AM 3
> > +#define W_AO 0
> > +#define W_AS 1
> > +
> > +#define W_BM 1
> > +#define W_BO 8
> > +#define W_BS 4
> > +
> > +#define W_CM 1
> > +#define W_CO 0
> > +#define W_CS 0
> > +
> > +#define W_DM 3
> > +#define W_DO 4
> > +#define W_DS 3
> > +
> > +#ifdef ARCH_X86_64
> > +#define PTR_SIZE "8"
> > +#else
> > +#define PTR_SIZE "4"
> > +#endif
> > +
> > +/** Used to minimize the amount of memory used in order to optimize cache performance. **/
> > +typedef struct {
> > + DWTELEM * * line; ///< For use by idwt and predict_slices.
> > + DWTELEM * * data_stack; ///< Used for internal purposes.
> > + int data_stack_top;
> > + int line_count;
> > + int line_width;
> > + int data_count;
> > + DWTELEM * base_buffer; ///< Buffer that this structure is caching.
> > +} slice_buffer;
>
> duplicating #defines and structs is not acceptable, these should be in a
> common header
>
Offending lines removed, snow.h is now our common header.
>
> > +
> > +#define snow_interleave_line_header(low,b,width)\
> > + int i = (width) - 2;\
> > + \
> > + if ((width) & 1)\
> > + {\
> > + (b)[i+1] = (low)[(i+1)>>1];\
> > + i--;\
> > + }
> > +
> > +#define snow_interleave_line_footer(low,high,b)\
> > + for (; i>=0; i-=2){\
> > + (b)[i+1] = (high)[i>>1];\
> > + (b)[i] = (low)[i>>1];\
> > + }
>
> these should be inline functions
>
Fixed as suggested.
>
> > +
> > +static void horizontal_compose97i_sse2(void *vb, int width){
> > + DWTELEM *b = vb;
> > + const int w2= (width+1)>>1;
> > + // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
> > + DWTELEM temp_buf[width>>1];
> > + DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) / 4);
>
> replace /4 by >>2 or make type unsigned divides by 4 and signed is slow
>
Fixed using >>2.
>
> [...]
> > + for(; i<w_l; i++){
> > + b[i] = b[i] - ((W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS);
> > + }
> > +
> > + if(width&1){
> > + b[w_l] = b[w_l] - ((W_DM * 2 * ref[w_l] + W_DO) >> W_DS);
> > + }
> [...]
> > + for(; i<w_r; i++){
> > + dst[i] = dst[i] - (b[i] + b[i + 1]);
> > + }
> > +
> > + if(!(width&1)){
> > + dst[w_r] = dst[w_r] - (2 * b[w_r]);
> > + }
> [...]
> > + for(; i<w_l; i++){
> > + b[i] = b[i] - (((-(ref[i] + ref[(i+1)])+W_BO) - 4*b[i])>>W_BS);
> > + }
> > +
> > + if(width&1){
> > + b[w_l] = b[w_l] - (((-2 * ref[w_l] + W_BO) - 4 * b[w_l]) >> W_BS);
> > + }
> ...
>
> replace this with a function, see the lift() function in snow.c on how if its
> not obvious
>
> same applies to the other such cases
>
2 cases fixed. If there are any more that I missed, please inform me.
>
> [...]
> > +static void vertical_compose97i_sse2(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> > + DWTELEM *b0 = vb0;
> > + DWTELEM *b1 = vb1;
> > + DWTELEM *b2 = vb2;
> > + DWTELEM *b3 = vb3;
> > + DWTELEM *b4 = vb4;
> > + DWTELEM *b5 = vb5;
> > + int i;
> > + int end_w2 = width >> 4; /* Needed because GCC does something totally brain dead and mis-loads end_w into the asm code if I use end_w directly.*/
> > +
> > + asm volatile (
> > + "sal $4, %%"REG_d" \n\t"
> > + "jmp 2f \n\t"
> > + "1: \n\t"
> > +
> > + "mov %5, %%"REG_a" \n\t"
> > + "mov %3, %%"REG_b" \n\t"
> > +
> > + "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
> > + "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
> > + "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
> > + "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
> > + "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
> > + "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
> > + "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "movdqa %%xmm0, %%xmm1 \n\t"
> > + "movdqa %%xmm2, %%xmm3 \n\t"
> > + "movdqa %%xmm4, %%xmm5 \n\t"
> > + "movdqa %%xmm6, %%xmm7 \n\t"
> > +
> > + "pslld $1, %%xmm0 \n\t"
> > + "pslld $1, %%xmm2 \n\t"
> > + "pslld $1, %%xmm4 \n\t"
> > + "pslld $1, %%xmm6 \n\t"
> > +
> > + "paddd %%xmm1, %%xmm0 \n\t"
> > + "paddd %%xmm3, %%xmm2 \n\t"
> > + "paddd %%xmm5, %%xmm4 \n\t"
> > + "paddd %%xmm7, %%xmm6 \n\t"
> > +
> > + "pcmpeqd %%xmm1, %%xmm1 \n\t"
> > + "pslld $31, %%xmm1 \n\t"
> > + "psrld $29, %%xmm1 \n\t"
> > + "mov %4, %%"REG_a" \n\t"
> > +
> > + "paddd %%xmm1, %%xmm0 \n\t"
> > + "paddd %%xmm1, %%xmm2 \n\t"
> > + "paddd %%xmm1, %%xmm4 \n\t"
> > + "paddd %%xmm1, %%xmm6 \n\t"
> > +
> > + "psrad $3, %%xmm0 \n\t"
> > + "psrad $3, %%xmm2 \n\t"
> > + "psrad $3, %%xmm4 \n\t"
> > + "psrad $3, %%xmm6 \n\t"
> > +
> > + "movdqa (%%"REG_a",%%"REG_d",4), %%xmm1 \n\t"
> > + "movdqa 16(%%"REG_a",%%"REG_d",4), %%xmm3 \n\t"
> > + "movdqa 32(%%"REG_a",%%"REG_d",4), %%xmm5 \n\t"
> > + "movdqa 48(%%"REG_a",%%"REG_d",4), %%xmm7 \n\t"
> > +
> > + "psubd %%xmm0, %%xmm1 \n\t"
> > + "psubd %%xmm2, %%xmm3 \n\t"
> > + "psubd %%xmm4, %%xmm5 \n\t"
> > + "psubd %%xmm6, %%xmm7 \n\t"
> > +
> > + "movdqa %%xmm1, (%%"REG_a",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm3, 16(%%"REG_a",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm5, 32(%%"REG_a",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm7, 48(%%"REG_a",%%"REG_d",4) \n\t"
> > +
> > + "mov %2, %%"REG_c" \n\t"
> > +
> > + "paddd (%%"REG_c",%%"REG_d",4), %%xmm1 \n\t"
> > + "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm3 \n\t"
> > + "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm5 \n\t"
> > + "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm7 \n\t"
> > +
> > + "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
> > + "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
> > + "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
> > + "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "psubd %%xmm1, %%xmm0 \n\t"
> > + "psubd %%xmm3, %%xmm2 \n\t"
> > + "psubd %%xmm5, %%xmm4 \n\t"
> > + "psubd %%xmm7, %%xmm6 \n\t"
> > +
> > + "movdqa %%xmm0, (%%"REG_b",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm2, 16(%%"REG_b",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm4, 32(%%"REG_b",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm6, 48(%%"REG_b",%%"REG_d",4) \n\t"
> > +
> > + "mov %1, %%"REG_a" \n\t"
> > +
> > + "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
> > + "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
> > + "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
> > + "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "movdqa (%%"REG_c",%%"REG_d",4), %%xmm1 \n\t"
> > + "movdqa 16(%%"REG_c",%%"REG_d",4), %%xmm3 \n\t"
> > + "movdqa 32(%%"REG_c",%%"REG_d",4), %%xmm5 \n\t"
> > + "movdqa 48(%%"REG_c",%%"REG_d",4), %%xmm7 \n\t"
> > +
> > + "pslld $2, %%xmm1 \n\t"
> > + "pslld $2, %%xmm3 \n\t"
> > + "pslld $2, %%xmm5 \n\t"
> > + "pslld $2, %%xmm7 \n\t"
> > +
> > + "paddd %%xmm1, %%xmm0 \n\t"
> > + "paddd %%xmm3, %%xmm2 \n\t"
> > + "paddd %%xmm5, %%xmm4 \n\t"
> > + "paddd %%xmm7, %%xmm6 \n\t"
> > +
> > + "pcmpeqd %%xmm1, %%xmm1 \n\t"
> > + "pslld $31, %%xmm1 \n\t"
> > + "psrld $28, %%xmm1 \n\t"
> > + "mov %0, %%"REG_b" \n\t"
> > +
> > + "paddd %%xmm1, %%xmm0 \n\t"
> > + "paddd %%xmm1, %%xmm2 \n\t"
> > + "paddd %%xmm1, %%xmm4 \n\t"
> > + "paddd %%xmm1, %%xmm6 \n\t"
> > +
> > + "psrad $4, %%xmm0 \n\t"
> > + "psrad $4, %%xmm2 \n\t"
> > + "psrad $4, %%xmm4 \n\t"
> > + "psrad $4, %%xmm6 \n\t"
> > +
> > + "paddd (%%"REG_c",%%"REG_d",4), %%xmm0 \n\t"
> > + "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm2 \n\t"
> > + "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm4 \n\t"
> > + "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "movdqa %%xmm0, (%%"REG_c",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm2, 16(%%"REG_c",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm4, 32(%%"REG_c",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm6, 48(%%"REG_c",%%"REG_d",4) \n\t"
> > +
> > + "paddd (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
> > + "paddd 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
> > + "paddd 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
> > + "paddd 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "movdqa %%xmm0, %%xmm1 \n\t"
> > + "movdqa %%xmm2, %%xmm3 \n\t"
> > + "movdqa %%xmm4, %%xmm5 \n\t"
> > + "movdqa %%xmm6, %%xmm7 \n\t"
> > +
> > + "pslld $1, %%xmm0 \n\t"
> > + "pslld $1, %%xmm2 \n\t"
> > + "pslld $1, %%xmm4 \n\t"
> > + "pslld $1, %%xmm6 \n\t"
> > +
> > + "paddd %%xmm1, %%xmm0 \n\t"
> > + "paddd %%xmm3, %%xmm2 \n\t"
> > + "paddd %%xmm5, %%xmm4 \n\t"
> > + "paddd %%xmm7, %%xmm6 \n\t"
> > +
> > + "psrad $1, %%xmm0 \n\t"
> > + "psrad $1, %%xmm2 \n\t"
> > + "psrad $1, %%xmm4 \n\t"
> > + "psrad $1, %%xmm6 \n\t"
> > +
> > + "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
> > + "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
> > + "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
> > + "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
> > +
> > + "movdqa %%xmm0, (%%"REG_a",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm2, 16(%%"REG_a",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm4, 32(%%"REG_a",%%"REG_d",4) \n\t"
> > + "movdqa %%xmm6, 48(%%"REG_a",%%"REG_d",4) \n\t"
> > +
> > + "2: \n\t"
> > + "sub $16, %%"REG_d" \n\t"
> > + "jge 1b \n\t"
> > + ::
> > + "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"d"(end_w2):
> > + "%"REG_a"","%"REG_b"","%"REG_c"");
>
> this code is not valid, REG_d is changed but neither output nor on the clobber list
>
REG_d is on the input list, so GCC recognizes it as clobbered? GCC
also refuses to let me put REG_d into the clobber list. I believe the
code is good as is?
> [...]
>
> > +static inline void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> > + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> > + int y, x;
> > + DWTELEM * dst;
> > + DWTELEM * * dst_array = sb->line + src_y;
> > +
> > + asm volatile(
> > + "mov %5, %%ebx \n\t"
> > + "mov %3, %%"REG_S" \n\t"
> > + "pcmpeqd %%xmm4, %%xmm4 \n\t"
> > + "pslld $31, %%xmm4 \n\t"
> > + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */
> > + "psrld $24, %%xmm4 \n\t" /* FRAC_BITS >> 1 */
> > +
> > + "1: \n\t"
> > + "movq (%%"REG_S"), %%xmm0 \n\t"
> > + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%xmm7, %%xmm0 \n\t"
> > + "movq 8(%%"REG_S"), %%xmm1 \n\t"
> > + "punpcklbw %%xmm7, %%xmm1 \n\t"
> > + "movq (%%"REG_d"), %%xmm5 \n\t"
> > + "mov %1, %%"REG_D" \n\t"
> > + "punpcklbw %%xmm7, %%xmm5 \n\t"
> > + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > + "movq (%%"REG_d"), %%xmm6 \n\t"
> > + "pmullw %%xmm0, %%xmm5 \n\t"
> > + "punpcklbw %%xmm7, %%xmm6 \n\t"
> > + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > + "mov (%%"REG_D"), %%"REG_D" \n\t"
> > +
> > + "movq 128(%%"REG_S"), %%xmm0 \n\t"
> > + "pmullw %%xmm1, %%xmm6 \n\t"
> > + "punpcklbw %%xmm7, %%xmm0 \n\t"
> > + "movq 136(%%"REG_S"), %%xmm1 \n\t"
> > + "add %2, %%"REG_D" \n\t"
> > + "punpcklbw %%xmm7, %%xmm1 \n\t"
> > + "movq (%%"REG_d"), %%xmm2 \n\t"
> > + "punpcklbw %%xmm7, %%xmm2 \n\t"
> > + "mov (%%"REG_a"), %%"REG_d" \n\t"
> > + "paddusw %%xmm5, %%xmm6 \n\t"
> > + "pmullw %%xmm0, %%xmm2 \n\t"
> > + "movq (%%"REG_d"), %%xmm3 \n\t"
> > + "mov %0, %%"REG_d" \n\t"
> > + "punpcklbw %%xmm7, %%xmm3 \n\t"
> > + "paddusw %%xmm2, %%xmm6 \n\t"
> > + "pmullw %%xmm1, %%xmm3 \n\t"
> > + "paddusw %%xmm3, %%xmm6 \n\t"
> > +
> > + "movdqa (%%"REG_D"), %%xmm3 \n\t"
> > + "movdqa %%xmm6, %%xmm0 \n\t"
> > + "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
> > + "punpckhwd %%xmm7, %%xmm6 \n\t"
> > + "movq 24(%%"REG_S"), %%xmm1 \n\t"
> > + "punpcklwd %%xmm7, %%xmm0 \n\t"
> > + "paddd %%xmm0, %%xmm3 \n\t"
> > + "paddd %%xmm6, %%xmm5 \n\t"
> > + "punpcklbw %%xmm7, %%xmm1 \n\t"
> > + "paddd %%xmm4, %%xmm3 \n\t"
> > + "paddd %%xmm4, %%xmm5 \n\t"
> > + "movq 16(%%"REG_S"), %%xmm0 \n\t"
> > + "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
> > + "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
> > +
> > + "packssdw %%xmm5, %%xmm3 \n\t"
> > + "mov %1, %%"REG_D" \n\t"
> > + "packuswb %%xmm7, %%xmm3 \n\t"
> > +
> > + "movq %%xmm3, (%%"REG_d") \n\t"
> > +
> > +
> > + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%xmm7, %%xmm0 \n\t"
> > + "movq (%%"REG_d",%%"REG_c"), %%xmm5; \n\t"
> > + "punpcklbw %%xmm7, %%xmm5 \n\t"
> > + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > + "movq (%%"REG_d",%%"REG_c"), %%xmm6; \n\t"
> > + "pmullw %%xmm0, %%xmm5 \n\t"
> > + "punpcklbw %%xmm7, %%xmm6 \n\t"
> > +
> > + "movq 144(%%"REG_S"), %%xmm0 \n\t"
> > + "pmullw %%xmm1, %%xmm6 \n\t"
> > + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%xmm7, %%xmm0 \n\t"
> > + "movq 152(%%"REG_S"), %%xmm1 \n\t"
> > + "punpcklbw %%xmm7, %%xmm1 \n\t"
> > + "movq (%%"REG_d",%%"REG_c"), %%xmm2;\n\t"
> > + "punpcklbw %%xmm7, %%xmm2 \n\t"
> > + "mov (%%"REG_a"), %%"REG_d" \n\t"
> > + "paddusw %%xmm5, %%xmm6 \n\t"
> > + "pmullw %%xmm0, %%xmm2 \n\t"
> > + "movq (%%"REG_d",%%"REG_c"), %%xmm3;\n\t"
> > + "punpcklbw %%xmm7, %%xmm3 \n\t"
> > + "paddusw %%xmm2, %%xmm6 \n\t"
> > + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
> > + "pmullw %%xmm1, %%xmm3 \n\t"
> > + "sal $1, %%"REG_c" \n\t"
> > + "add %2, %%"REG_D" \n\t"
> > + "paddusw %%xmm3, %%xmm6 \n\t"
> > + "mov %0, %%"REG_d" \n\t"
> > +
> > + "movdqa (%%"REG_D"), %%xmm3 \n\t"
> > + "movdqa %%xmm6, %%xmm0 \n\t"
> > + "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
> > + "punpckhwd %%xmm7, %%xmm6 \n\t"
> > + "punpcklwd %%xmm7, %%xmm0 \n\t"
> > + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> > + "paddd %%xmm0, %%xmm3 \n\t"
> > + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> > + "paddd %%xmm6, %%xmm5 \n\t"
> > + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> > + "paddd %%xmm4, %%xmm3 \n\t"
> > + "add %%"REG_c", (%%"REG_a") \n\t"
> > + "paddd %%xmm4, %%xmm5 \n\t"
> > + "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
> > + "add $"PTR_SIZE"*2, %1 \n\t"
> > + "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
> > + "add $32, %%"REG_S" \n\t"
> > +
> > + "packssdw %%xmm5, %%xmm3 \n\t"
> > + "add %%"REG_c", %0 \n\t"
> > + "packuswb %%xmm7, %%xmm3 \n\t"
> > +
> > + "sar $1, %%"REG_c" \n\t"
> > + "movq %%xmm3, (%%"REG_d",%%"REG_c");\n\t"
> > +
> > + "sub $2, %%"REG_b" \n\t"
> > + "jnz 1b \n\t"
> > + :
> > + :
> > + "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"(b_h),"c"(src_stride):
> > + "%"REG_b"","%"REG_S"","%"REG_D"","%"REG_d"");
>
> a minor issue, dont use ebx please, it causes PIC fanboys to flame us
> and a major one REG_c is changed and not an output or cloberlisted
>
ebx changed to REG_b. b_h (input to the function) has been changed from
type int to type long so that this fix will work.
REG_c is an input. GCC refuses to allow it on the clobber list.
>
> [...]
> > +
> > +static inline void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> > + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> > + int y, x;
> > + DWTELEM * dst;
> > + DWTELEM * * dst_array = sb->line + src_y;
> > +
> > + asm volatile(
> > + "mov %5, %%ebx \n\t"
> > + "mov %3, %%"REG_S" \n\t"
> > + "pcmpeqd %%mm4, %%mm4 \n\t"
> > + "pslld $31, %%mm4 \n\t"
> > + "pxor %%mm7, %%mm7 \n\t" /* 0 */
> > + "psrld $24, %%mm4 \n\t" /* FRAC_BITS >> 1 */
> > +
> > + "1: \n\t"
> > + "movd (%%"REG_S"), %%mm0 \n\t"
> > + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 16(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "movd (%%"REG_d"), %%mm5 \n\t"
> > + "mov %1, %%"REG_D" \n\t"
> > + "punpcklbw %%mm7, %%mm5 \n\t"
> > + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > + "movd (%%"REG_d"), %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm6 \n\t"
> > + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +
> > + "movd 512(%%"REG_S"), %%mm0 \n\t"
> > + "pmullw %%mm1, %%mm6 \n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 528(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "movd (%%"REG_d"), %%mm2 \n\t"
> > + "punpcklbw %%mm7, %%mm2 \n\t"
> > + "mov (%%"REG_a"), %%"REG_d" \n\t"
> > + "paddusw %%mm5, %%mm6 \n\t"
> > + "mov (%%"REG_D"), %%"REG_D" \n\t"
> > + "pmullw %%mm0, %%mm2 \n\t"
> > + "movd (%%"REG_d"), %%mm3 \n\t"
> > + "mov %0, %%"REG_d" \n\t"
> > + "punpcklbw %%mm7, %%mm3 \n\t"
> > + "add %2, %%"REG_D" \n\t"
> > + "paddusw %%mm2, %%mm6 \n\t"
> > + "pmullw %%mm1, %%mm3 \n\t"
> > + "paddusw %%mm3, %%mm6 \n\t"
> > +
> > + "movq (%%"REG_D"), %%mm3 \n\t"
> > + "movq %%mm6, %%mm0 \n\t"
> > + "movq 8(%%"REG_D"), %%mm5 \n\t"
> > + "punpckhwd %%mm7, %%mm6 \n\t"
> > + "movd 20(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklwd %%mm7, %%mm0 \n\t"
> > + "paddd %%mm0, %%mm3 \n\t"
> > + "paddd %%mm6, %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "paddd %%mm4, %%mm3 \n\t"
> > + "paddd %%mm4, %%mm5 \n\t"
> > + "movd 4(%%"REG_S"), %%mm0 \n\t"
> > + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> > + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> > +
> > + "packssdw %%mm5, %%mm3 \n\t"
> > + "packuswb %%mm7, %%mm3 \n\t"
> > +
> > + "movd %%mm3, (%%"REG_d") \n\t"
> > +
> > +
> > + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 4(%%"REG_d"), %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm5 \n\t"
> > + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > + "movd 4(%%"REG_d"), %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm6 \n\t"
> > +
> > + "movd 516(%%"REG_S"), %%mm0 \n\t"
> > + "pmullw %%mm1, %%mm6 \n\t"
> > + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 532(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "movd 4(%%"REG_d"), %%mm2 \n\t"
> > + "punpcklbw %%mm7, %%mm2 \n\t"
> > + "mov (%%"REG_a"), %%"REG_d" \n\t"
> > + "paddusw %%mm5, %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm2 \n\t"
> > + "movd 4(%%"REG_d"), %%mm3 \n\t"
> > + "punpcklbw %%mm7, %%mm3 \n\t"
> > + "paddusw %%mm2, %%mm6 \n\t"
> > + "pmullw %%mm1, %%mm3 \n\t"
> > + "paddusw %%mm3, %%mm6 \n\t"
> > + "mov %0, %%"REG_d" \n\t"
> > +
> > + "movq 16(%%"REG_D"), %%mm3 \n\t"
> > + "movq %%mm6, %%mm0 \n\t"
> > + "movq 24(%%"REG_D"), %%mm5 \n\t"
> > + "punpckhwd %%mm7, %%mm6 \n\t"
> > + "punpcklwd %%mm7, %%mm0 \n\t"
> > + "paddd %%mm0, %%mm3 \n\t"
> > + "paddd %%mm6, %%mm5 \n\t"
> > + "paddd %%mm4, %%mm3 \n\t"
> > + "paddd %%mm4, %%mm5 \n\t"
> > + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> > + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> > +
> > + "packssdw %%mm5, %%mm3 \n\t"
> > + "packuswb %%mm7, %%mm3 \n\t"
> > +
> > + "movd %%mm3, 4(%%"REG_d") \n\t"
> > +
> > +
> > +
> > + "movd 8(%%"REG_S"), %%mm0 \n\t"
> > + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 24(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "movd 8(%%"REG_d"), %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm5 \n\t"
> > + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > + "movd 8(%%"REG_d"), %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm6 \n\t"
> > + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > +
> > + "movd 520(%%"REG_S"), %%mm0 \n\t"
> > + "pmullw %%mm1, %%mm6 \n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 536(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "movd 8(%%"REG_d"), %%mm2 \n\t"
> > + "punpcklbw %%mm7, %%mm2 \n\t"
> > + "mov (%%"REG_a"), %%"REG_d" \n\t"
> > + "paddusw %%mm5, %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm2 \n\t"
> > + "movd 8(%%"REG_d"), %%mm3 \n\t"
> > + "mov %0, %%"REG_d" \n\t"
> > + "punpcklbw %%mm7, %%mm3 \n\t"
> > + "paddusw %%mm2, %%mm6 \n\t"
> > + "pmullw %%mm1, %%mm3 \n\t"
> > + "paddusw %%mm3, %%mm6 \n\t"
> > +
> > + "movq 32(%%"REG_D"), %%mm3 \n\t"
> > + "movq %%mm6, %%mm0 \n\t"
> > + "movq 40(%%"REG_D"), %%mm5 \n\t"
> > + "punpckhwd %%mm7, %%mm6 \n\t"
> > + "movd 28(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklwd %%mm7, %%mm0 \n\t"
> > + "paddd %%mm0, %%mm3 \n\t"
> > + "paddd %%mm6, %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "paddd %%mm4, %%mm3 \n\t"
> > + "paddd %%mm4, %%mm5 \n\t"
> > + "movd 12(%%"REG_S"), %%mm0 \n\t"
> > + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> > + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> > +
> > + "packssdw %%mm5, %%mm3 \n\t"
> > + "packuswb %%mm7, %%mm3 \n\t"
> > +
> > + "movd %%mm3, 8(%%"REG_d") \n\t"
> > +
> > +
> > + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 12(%%"REG_d"), %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm5 \n\t"
> > + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> > + "movd 12(%%"REG_d"), %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm5 \n\t"
> > + "punpcklbw %%mm7, %%mm6 \n\t"
> > +
> > + "movd 524(%%"REG_S"), %%mm0 \n\t"
> > + "pmullw %%mm1, %%mm6 \n\t"
> > + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> > + "punpcklbw %%mm7, %%mm0 \n\t"
> > + "movd 540(%%"REG_S"), %%mm1 \n\t"
> > + "punpcklbw %%mm7, %%mm1 \n\t"
> > + "movd 12(%%"REG_d"), %%mm2 \n\t"
> > + "punpcklbw %%mm7, %%mm2 \n\t"
> > + "mov (%%"REG_a"), %%"REG_d" \n\t"
> > + "paddusw %%mm5, %%mm6 \n\t"
> > + "pmullw %%mm0, %%mm2 \n\t"
> > + "movd 12(%%"REG_d"), %%mm3 \n\t"
> > + "punpcklbw %%mm7, %%mm3 \n\t"
> > + "paddusw %%mm2, %%mm6 \n\t"
> > + "pmullw %%mm1, %%mm3 \n\t"
> > + "paddusw %%mm3, %%mm6 \n\t"
> > + "mov %0, %%"REG_d" \n\t"
> > +
> > + "movq 48(%%"REG_D"), %%mm3 \n\t"
> > + "movq %%mm6, %%mm0 \n\t"
> > + "movq 56(%%"REG_D"), %%mm5 \n\t"
> > + "punpckhwd %%mm7, %%mm6 \n\t"
> > + "punpcklwd %%mm7, %%mm0 \n\t"
> > + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> > + "paddd %%mm0, %%mm3 \n\t"
> > + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> > + "paddd %%mm6, %%mm5 \n\t"
> > + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> > + "paddd %%mm4, %%mm3 \n\t"
> > + "add %%"REG_c", (%%"REG_a") \n\t"
> > + "paddd %%mm4, %%mm5 \n\t"
> > + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> > + "add $"PTR_SIZE"*1, %1 \n\t"
> > + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> > + "add $32, %%"REG_S" \n\t"
> > +
> > + "packssdw %%mm5, %%mm3 \n\t"
> > + "add %%"REG_c", %0 \n\t"
> > + "packuswb %%mm7, %%mm3 \n\t"
> > +
> > + "movd %%mm3, 12(%%"REG_d") \n\t"
> > +
> > + "dec %%"REG_b" \n\t"
> > + "jnz 1b \n\t"
> > + "emms \t\t"
>
> is the emms here really needed?
>
Fixed. Experimentally, they are not needed, and have been removed.
> [...]
>
Thanks for your many useful comments Michael. Would you mind having a
look at the updated patch?
Sincerely,
Robert Edele
-------------- next part --------------
A non-text attachment was scrubbed...
Name: snow_mmx.patch
Type: text/x-patch
Size: 93269 bytes
Desc:
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20060307/7ce5a015/attachment.bin>
More information about the ffmpeg-devel
mailing list