[MPlayer-dev-eng] Re: Re: Re: fast SPP - working code

Reimar Döffinger Reimar.Doeffinger at stud.uni-karlsruhe.de
Sat Jun 4 18:26:37 CEST 2005


Hi,
> > I JUST fixed it, but apparently I didn't.
> > 
> > {standard input}:791: Error: Incorrect register `%rdx' used with `l' suffix
> > {standard input}:800: Error: Incorrect register `%rsi' used with `l' suffix
> > 
> > Apparently using instructions like 'addl' on those registers is
> > illegal. I'm not sure what to do at this point; I've never written 64-bit
> > code...
> 
> I think that most of them should simply be replaced with the instructions
> without the suffix (addl->add). But I haven't written any 64-bit asm either.

If that was all... I really thought that by now everyone knew that
casting a pointer to int is not a good idea. Obviously not.
The attached patch fixes it on AMD64, though it isn't a beauty (esp. the int ->
long changes).
I'm also not 100% sure the __attribute__((aligned(32))) on that
on-stack variable will be honored by all compilers, but it
certainly worked for me (and I'm also not sure whether 32 isn't overkill)...

Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libmpcodecs/vf_fspp.c
===================================================================
RCS file: /cvsroot/mplayer/main/libmpcodecs/vf_fspp.c,v
retrieving revision 1.3
diff -u -r1.3 vf_fspp.c
--- libmpcodecs/vf_fspp.c	4 Jun 2005 12:41:19 -0000	1.3
+++ libmpcodecs/vf_fspp.c	4 Jun 2005 16:13:58 -0000
@@ -38,11 +38,6 @@
 
 #include "../config.h"
 
-#ifdef ARCH_X86_64
-// until the mmx code is fixed to support x86-64
-#undef HAVE_MMX
-#endif
-
 #ifdef USE_LIBAVCODEC
 
 #include "../mp_msg.h"
@@ -187,213 +182,213 @@
 #else /* HAVE_MMX */
 
 //This func reads from 1 slice, 1 and clears 0 & 1
-static void store_slice_mmx(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
+static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
 {
     const uint8_t *od=&dither[0][0];
+    const uint8_t *end=&dither[height][0];
     width = (width+7)&~7;
     dst_stride-=width;
     //src_stride=(src_stride-width)*2;
-    height=(int)(&dither[height][0]);
     asm volatile(
-	"movl %5, %%edx                \n\t"
-	"movl %6, %%esi                \n\t"
-	"movl %7, %%edi                \n\t"
-	"movl %1, %%eax                \n\t"
-	"movd %%edx, %%mm5             \n\t"
-	"xorl $-1, %%edx              \n\t"
-	"movl %%eax, %%ecx             \n\t"
-	"addl $7, %%edx               \n\t"
-	"negl %%eax                   \n\t"
-	"subl %0, %%ecx            \n\t"
-	"addl %%ecx, %%ecx             \n\t"
-	"movd %%edx, %%mm2             \n\t"
-	"movl %%ecx, %1       \n\t"
-	"movl %2, %%edx               \n\t"
-	"shll $4, %%eax               \n\t"
+	"mov %5, %%"REG_d"                \n\t"
+	"mov %6, %%"REG_S"                \n\t"
+	"mov %7, %%"REG_D"                \n\t"
+	"mov %1, %%"REG_a"                \n\t"
+	"movd %%"REG_d", %%mm5             \n\t"
+	"xor $-1, %%"REG_d"              \n\t"
+	"mov %%"REG_a", %%"REG_c"             \n\t"
+	"add $7, %%"REG_d"               \n\t"
+	"neg %%"REG_a"                   \n\t"
+	"sub %0, %%"REG_c"            \n\t"
+	"add %%"REG_c", %%"REG_c"             \n\t"
+	"movd %%"REG_d", %%mm2             \n\t"
+	"mov %%"REG_c", %1       \n\t"
+	"mov %2, %%"REG_d"               \n\t"
+	"shl $4, %%"REG_a"               \n\t"
 
 	"2:                        \n\t"
-	"movq (%%edx), %%mm3           \n\t"
+	"movq (%%"REG_d"), %%mm3           \n\t"
 	"movq %%mm3, %%mm4             \n\t"
 	"pxor %%mm7, %%mm7             \n\t"
 	"punpcklbw %%mm7, %%mm3        \n\t"
 	"punpckhbw %%mm7, %%mm4        \n\t"
-	"movl %0, %%ecx            \n\t"
+	"mov %0, %%"REG_c"            \n\t"
 	"psraw %%mm5, %%mm3            \n\t"
 	"psraw %%mm5, %%mm4            \n\t"
 	"1:                        \n\t"
-	"movq %%mm7, (%%esi,%%eax,)     \n\t"
-	"movq (%%esi), %%mm0           \n\t"
-	"movq 8(%%esi), %%mm1          \n\t"
+	"movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
+	"movq (%%"REG_S"), %%mm0           \n\t"
+	"movq 8(%%"REG_S"), %%mm1          \n\t"
 
-	"movq %%mm7, 8(%%esi,%%eax,)    \n\t"
+	"movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
 	"paddw %%mm3, %%mm0            \n\t"
 	"paddw %%mm4, %%mm1            \n\t"
 
-	"movq %%mm7, (%%esi)           \n\t"
+	"movq %%mm7, (%%"REG_S")           \n\t"
 	"psraw %%mm2, %%mm0            \n\t"
 	"psraw %%mm2, %%mm1            \n\t"
 
-	"movq %%mm7, 8(%%esi)          \n\t"
+	"movq %%mm7, 8(%%"REG_S")          \n\t"
 	"packuswb %%mm1, %%mm0         \n\t"
-	"addl $16, %%esi              \n\t"
+	"add $16, %%"REG_S"              \n\t"
 
-	"movq %%mm0, (%%edi)           \n\t"
-	"addl $8, %%edi               \n\t"
-	"subl $8, %%ecx               \n\t"
+	"movq %%mm0, (%%"REG_D")           \n\t"
+	"add $8, %%"REG_D"               \n\t"
+	"sub $8, %%"REG_c"               \n\t"
 	"jg 1b                      \n\t"
-	"addl %1, %%esi       \n\t"
-	"addl $8, %%edx               \n\t"
-	"addl %3, %%edi       \n\t"
-	"cmpl %4, %%edx           \n\t"
+	"add %1, %%"REG_S"       \n\t"
+	"add $8, %%"REG_d"               \n\t"
+	"add %3, %%"REG_D"       \n\t"
+	"cmp %4, %%"REG_d"           \n\t"
 	"jl 2b                      \n\t"
 
 	:
-	: "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (height),
+	: "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (end),
 	  "m" (log2_scale), "m" (src), "m" (dst) //input
-	: "%eax", "%ecx", "%edx", "%esi", "%edi"
+	: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
 	);    
 }
 
 //This func reads from 2 slices, 0 & 2  and clears 2-nd
-static void store_slice2_mmx(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
+static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
 {
     const uint8_t *od=&dither[0][0];
+    const uint8_t *end=&dither[height][0];
     width = (width+7)&~7;
     dst_stride-=width;
     //src_stride=(src_stride-width)*2;
-    height=(int)(&dither[height][0]);
     asm volatile(
-	"movl %5, %%edx                \n\t"
-	"movl %6, %%esi                \n\t"
-	"movl %7, %%edi                \n\t"
-	"movl %1, %%eax            \n\t"
-	"movd %%edx, %%mm5             \n\t"
-	"xorl $-1, %%edx              \n\t"
-	"movl %%eax, %%ecx             \n\t"
-	"addl $7, %%edx               \n\t"
-	"subl %0, %%ecx            \n\t"
-	"addl %%ecx, %%ecx             \n\t"
-	"movd %%edx, %%mm2             \n\t"
-	"movl %%ecx, %1       \n\t"
-	"movl %2, %%edx               \n\t"
-	"shll $5, %%eax               \n\t"
+	"mov %5, %%"REG_d"                \n\t"
+	"mov %6, %%"REG_S"                \n\t"
+	"mov %7, %%"REG_D"                \n\t"
+	"mov %1, %%"REG_a"            \n\t"
+	"movd %%"REG_d", %%mm5             \n\t"
+	"xor $-1, %%"REG_d"              \n\t"
+	"mov %%"REG_a", %%"REG_c"             \n\t"
+	"add $7, %%"REG_d"               \n\t"
+	"sub %0, %%"REG_c"            \n\t"
+	"add %%"REG_c", %%"REG_c"             \n\t"
+	"movd %%"REG_d", %%mm2             \n\t"
+	"mov %%"REG_c", %1       \n\t"
+	"mov %2, %%"REG_d"               \n\t"
+	"shl $5, %%"REG_a"               \n\t"
 
 	"2:                        \n\t"
-	"movq (%%edx), %%mm3           \n\t"
+	"movq (%%"REG_d"), %%mm3           \n\t"
 	"movq %%mm3, %%mm4             \n\t"
 	"pxor %%mm7, %%mm7             \n\t"
 	"punpcklbw %%mm7, %%mm3        \n\t"
 	"punpckhbw %%mm7, %%mm4        \n\t"
-	"movl %0, %%ecx            \n\t"
+	"mov %0, %%"REG_c"            \n\t"
 	"psraw %%mm5, %%mm3            \n\t"
 	"psraw %%mm5, %%mm4            \n\t"
 	"1:                        \n\t"
-	"movq (%%esi), %%mm0           \n\t"
-	"movq 8(%%esi), %%mm1          \n\t"
+	"movq (%%"REG_S"), %%mm0           \n\t"
+	"movq 8(%%"REG_S"), %%mm1          \n\t"
 	"paddw %%mm3, %%mm0            \n\t"
 
-	"paddw (%%esi,%%eax,), %%mm0    \n\t"
+	"paddw (%%"REG_S",%%"REG_a",), %%mm0    \n\t"
 	"paddw %%mm4, %%mm1            \n\t"
-	"movq 8(%%esi,%%eax,), %%mm6    \n\t"
+	"movq 8(%%"REG_S",%%"REG_a",), %%mm6    \n\t"
 
-	"movq %%mm7, (%%esi,%%eax,)     \n\t"
+	"movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
 	"psraw %%mm2, %%mm0            \n\t"
 	"paddw %%mm6, %%mm1            \n\t"
 
-	"movq %%mm7, 8(%%esi,%%eax,)    \n\t"
+	"movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
 	"psraw %%mm2, %%mm1            \n\t"
 	"packuswb %%mm1, %%mm0         \n\t"
 
-	"movq %%mm0, (%%edi)           \n\t"
-	"addl $16, %%esi              \n\t"
-	"addl $8, %%edi               \n\t"
-	"subl $8, %%ecx               \n\t"
+	"movq %%mm0, (%%"REG_D")           \n\t"
+	"add $16, %%"REG_S"              \n\t"
+	"add $8, %%"REG_D"               \n\t"
+	"sub $8, %%"REG_c"               \n\t"
 	"jg 1b                      \n\t"
-	"addl %1, %%esi       \n\t"
-	"addl $8, %%edx               \n\t"
-	"addl %3, %%edi       \n\t"
-	"cmpl %4, %%edx           \n\t"
+	"add %1, %%"REG_S"       \n\t"
+	"add $8, %%"REG_d"               \n\t"
+	"add %3, %%"REG_D"       \n\t"
+	"cmp %4, %%"REG_d"           \n\t"
 	"jl 2b                      \n\t"
 
 	:
-	: "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (height),
+	: "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (end),
 	  "m" (log2_scale), "m" (src), "m" (dst) //input
-	: "%eax", "%ecx", "%edx", "%edi", "%esi"
+	: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
 	);  
 }
 
 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
 {
-    int adr=(int)(&p->threshold_mtx_noq[0]);
+    uint64_t *adr=&p->threshold_mtx_noq[0];
     asm volatile(
 	"movd %0, %%mm7                \n\t"
-	"addl $8*8*2, %%edi            \n\t"
-	"movq 0*8(%%esi), %%mm0        \n\t"
+	"add $8*8*2, %%"REG_D"            \n\t"
+	"movq 0*8(%%"REG_S"), %%mm0        \n\t"
 	"punpcklwd %%mm7, %%mm7        \n\t"
-	"movq 1*8(%%esi), %%mm1        \n\t"
+	"movq 1*8(%%"REG_S"), %%mm1        \n\t"
 	"punpckldq %%mm7, %%mm7        \n\t"
 	"pmullw %%mm7, %%mm0           \n\t"
 
-	"movq 2*8(%%esi), %%mm2        \n\t"
+	"movq 2*8(%%"REG_S"), %%mm2        \n\t"
 	"pmullw %%mm7, %%mm1           \n\t"
 
-	"movq 3*8(%%esi), %%mm3        \n\t"
+	"movq 3*8(%%"REG_S"), %%mm3        \n\t"
 	"pmullw %%mm7, %%mm2           \n\t"
 
-	"movq %%mm0, 0*8(%%edi)        \n\t"
-	"movq 4*8(%%esi), %%mm4        \n\t"
+	"movq %%mm0, 0*8(%%"REG_D")        \n\t"
+	"movq 4*8(%%"REG_S"), %%mm4        \n\t"
 	"pmullw %%mm7, %%mm3           \n\t"
 
-	"movq %%mm1, 1*8(%%edi)        \n\t"
-	"movq 5*8(%%esi), %%mm5        \n\t"
+	"movq %%mm1, 1*8(%%"REG_D")        \n\t"
+	"movq 5*8(%%"REG_S"), %%mm5        \n\t"
 	"pmullw %%mm7, %%mm4           \n\t"
 
-	"movq %%mm2, 2*8(%%edi)        \n\t"
-	"movq 6*8(%%esi), %%mm6        \n\t"
+	"movq %%mm2, 2*8(%%"REG_D")        \n\t"
+	"movq 6*8(%%"REG_S"), %%mm6        \n\t"
 	"pmullw %%mm7, %%mm5           \n\t"
 
-	"movq %%mm3, 3*8(%%edi)        \n\t"
-	"movq 7*8+0*8(%%esi), %%mm0    \n\t"
+	"movq %%mm3, 3*8(%%"REG_D")        \n\t"
+	"movq 7*8+0*8(%%"REG_S"), %%mm0    \n\t"
 	"pmullw %%mm7, %%mm6           \n\t"
 
-	"movq %%mm4, 4*8(%%edi)        \n\t"
-	"movq 7*8+1*8(%%esi), %%mm1    \n\t"
+	"movq %%mm4, 4*8(%%"REG_D")        \n\t"
+	"movq 7*8+1*8(%%"REG_S"), %%mm1    \n\t"
 	"pmullw %%mm7, %%mm0           \n\t"
 
-	"movq %%mm5, 5*8(%%edi)        \n\t"
-	"movq 7*8+2*8(%%esi), %%mm2    \n\t"
+	"movq %%mm5, 5*8(%%"REG_D")        \n\t"
+	"movq 7*8+2*8(%%"REG_S"), %%mm2    \n\t"
 	"pmullw %%mm7, %%mm1           \n\t"
 
-	"movq %%mm6, 6*8(%%edi)        \n\t"
-	"movq 7*8+3*8(%%esi), %%mm3    \n\t"
+	"movq %%mm6, 6*8(%%"REG_D")        \n\t"
+	"movq 7*8+3*8(%%"REG_S"), %%mm3    \n\t"
 	"pmullw %%mm7, %%mm2           \n\t"
 
-	"movq %%mm0, 7*8+0*8(%%edi)    \n\t"
-	"movq 7*8+4*8(%%esi), %%mm4    \n\t"
+	"movq %%mm0, 7*8+0*8(%%"REG_D")    \n\t"
+	"movq 7*8+4*8(%%"REG_S"), %%mm4    \n\t"
 	"pmullw %%mm7, %%mm3           \n\t"
 
-	"movq %%mm1, 7*8+1*8(%%edi)    \n\t"
-	"movq 7*8+5*8(%%esi), %%mm5    \n\t"
+	"movq %%mm1, 7*8+1*8(%%"REG_D")    \n\t"
+	"movq 7*8+5*8(%%"REG_S"), %%mm5    \n\t"
 	"pmullw %%mm7, %%mm4           \n\t"
 
-	"movq %%mm2, 7*8+2*8(%%edi)    \n\t"
-	"movq 7*8+6*8(%%esi), %%mm6    \n\t"
+	"movq %%mm2, 7*8+2*8(%%"REG_D")    \n\t"
+	"movq 7*8+6*8(%%"REG_S"), %%mm6    \n\t"
 	"pmullw %%mm7, %%mm5           \n\t"
 
-	"movq %%mm3, 7*8+3*8(%%edi)    \n\t"
-	"movq 14*8+0*8(%%esi), %%mm0   \n\t"
+	"movq %%mm3, 7*8+3*8(%%"REG_D")    \n\t"
+	"movq 14*8+0*8(%%"REG_S"), %%mm0   \n\t"
 	"pmullw %%mm7, %%mm6           \n\t"
 
-	"movq %%mm4, 7*8+4*8(%%edi)    \n\t"
-	"movq 14*8+1*8(%%esi), %%mm1   \n\t"
+	"movq %%mm4, 7*8+4*8(%%"REG_D")    \n\t"
+	"movq 14*8+1*8(%%"REG_S"), %%mm1   \n\t"
 	"pmullw %%mm7, %%mm0           \n\t"
 
-	"movq %%mm5, 7*8+5*8(%%edi)    \n\t"
+	"movq %%mm5, 7*8+5*8(%%"REG_D")    \n\t"
 	"pmullw %%mm7, %%mm1           \n\t"
 
-	"movq %%mm6, 7*8+6*8(%%edi)    \n\t"
-	"movq %%mm0, 14*8+0*8(%%edi)   \n\t"
-	"movq %%mm1, 14*8+1*8(%%edi)   \n\t"
+	"movq %%mm6, 7*8+6*8(%%"REG_D")    \n\t"
+	"movq %%mm0, 14*8+0*8(%%"REG_D")   \n\t"
+	"movq %%mm1, 14*8+1*8(%%"REG_D")   \n\t"
 
 	: "+g" (q), "+S" (adr), "+D" (adr)
 	:
@@ -422,8 +417,7 @@
     const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
     const int step=6-p->log2_count;
     const int qps= 3 + is_luma; 
-    int32_t block_align1[4*8*BLOCKSZ+ 4*8*BLOCKSZ+8];//32
-    int32_t *block_align=(int32_t*)(((int)block_align1+31)&(~31));
+    int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
     DCTELEM *block= (DCTELEM *)block_align;
     DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);    
 
@@ -877,27 +871,27 @@
     asm volatile(
 	".align 16                   \n\t"
 	"1:                   \n\t"
-	"movq "DCTSIZE_S"*0*2(%%esi), %%mm1 \n\t"
+	"movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
 	//
-	"movq "DCTSIZE_S"*3*2(%%esi), %%mm7 \n\t"
+	"movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
 	"movq %%mm1, %%mm0             \n\t"
 
-	"paddw "DCTSIZE_S"*7*2(%%esi), %%mm1 \n\t" //t0    
+	"paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0    
 	"movq %%mm7, %%mm3             \n\t"
 
-	"paddw "DCTSIZE_S"*4*2(%%esi), %%mm7 \n\t" //t3
+	"paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
 	"movq %%mm1, %%mm5             \n\t"
 
-	"movq "DCTSIZE_S"*1*2(%%esi), %%mm6 \n\t"
+	"movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
 	"psubw %%mm7, %%mm1            \n\t" //t13
 
-	"movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
 	"movq %%mm6, %%mm4             \n\t"
 
-	"paddw "DCTSIZE_S"*6*2(%%esi), %%mm6 \n\t" //t1
+	"paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
 	"paddw %%mm7, %%mm5            \n\t" //t10
 
-	"paddw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t2
+	"paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
 	"movq %%mm6, %%mm7             \n\t"
 
 	"paddw %%mm2, %%mm6            \n\t" //t11    
@@ -909,21 +903,21 @@
 	"psubw %%mm6, %%mm2            \n\t" //d4      
 	"paddw %%mm1, %%mm7            \n\t"
 
-	"movq  4*16(%%edx), %%mm6      \n\t"
+	"movq  4*16(%%"REG_d"), %%mm6      \n\t"
 	"psllw $2, %%mm7              \n\t"
 
-	"psubw 0*16(%%edx), %%mm5      \n\t"
+	"psubw 0*16(%%"REG_d"), %%mm5      \n\t"
 	"psubw %%mm6, %%mm2            \n\t"
 
-	"paddusw 0*16(%%edx), %%mm5    \n\t"
+	"paddusw 0*16(%%"REG_d"), %%mm5    \n\t"
 	"paddusw %%mm6, %%mm2          \n\t"
 
 	"pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
 	//
-	"paddw 0*16(%%edx), %%mm5      \n\t"
+	"paddw 0*16(%%"REG_d"), %%mm5      \n\t"
 	"paddw %%mm6, %%mm2            \n\t"
 
-	"psubusw 0*16(%%edx), %%mm5    \n\t"
+	"psubusw 0*16(%%"REG_d"), %%mm5    \n\t"
 	"psubusw %%mm6, %%mm2          \n\t"
 
 //This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
@@ -938,23 +932,23 @@
 	"movq %%mm1, %%mm6             \n\t"
 	"paddw %%mm7, %%mm1            \n\t" //d2
 
-	"psubw 2*16(%%edx), %%mm1      \n\t"
+	"psubw 2*16(%%"REG_d"), %%mm1      \n\t"
 	"psubw %%mm7, %%mm6            \n\t" //d6
 
-	"movq 6*16(%%edx), %%mm7       \n\t"
+	"movq 6*16(%%"REG_d"), %%mm7       \n\t"
 	"psraw $2, %%mm5              \n\t"
 
-	"paddusw 2*16(%%edx), %%mm1    \n\t"
+	"paddusw 2*16(%%"REG_d"), %%mm1    \n\t"
 	"psubw %%mm7, %%mm6            \n\t"
 	// t7 d2 /t11 t4 t6 - d6 /t10     
 
-	"paddw 2*16(%%edx), %%mm1      \n\t"
+	"paddw 2*16(%%"REG_d"), %%mm1      \n\t"
 	"paddusw %%mm7, %%mm6          \n\t"
 
-	"psubusw 2*16(%%edx), %%mm1    \n\t"
+	"psubusw 2*16(%%"REG_d"), %%mm1    \n\t"
 	"paddw %%mm7, %%mm6            \n\t"
 
-	"psubw "DCTSIZE_S"*4*2(%%esi), %%mm3 \n\t"
+	"psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
 	"psubusw %%mm7, %%mm6          \n\t"
 
 	//movq [edi+"DCTSIZE_S"*2*2], mm1
@@ -962,10 +956,10 @@
 	"movq %%mm1, %%mm7             \n\t"
 	"psraw $2, %%mm2              \n\t"
 
-	"psubw "DCTSIZE_S"*6*2(%%esi), %%mm4 \n\t"
+	"psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
 	"psubw %%mm6, %%mm1            \n\t"
 
-	"psubw "DCTSIZE_S"*7*2(%%esi), %%mm0 \n\t"
+	"psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
 	"paddw %%mm7, %%mm6            \n\t" //'t13
 
 	"psraw $2, %%mm6              \n\t" //paddw mm6, MM_2 !!    ---
@@ -977,10 +971,10 @@
 	"movq %%mm2, "MANGLE(temps)"+0*8       \n\t" //!
 	"psubw %%mm6, %%mm7            \n\t" //'t3
 
-	"movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
 	"psubw %%mm6, %%mm1            \n\t" //'t12        
 
-	"psubw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t5
+	"psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
 	"movq %%mm5, %%mm6             \n\t"
 
 	"movq %%mm7, "MANGLE(temps)"+3*8       \n\t"
@@ -1013,7 +1007,7 @@
 	"movq %%mm5, "MANGLE(temps)"+1*8       \n\t"
 	"paddw %%mm3, %%mm4            \n\t" //z4
 
-	"movq 3*16(%%edx), %%mm3       \n\t"
+	"movq 3*16(%%"REG_d"), %%mm3       \n\t"
 	"movq %%mm0, %%mm1             \n\t"
 
 	"movq %%mm6, "MANGLE(temps)"+2*8       \n\t"
@@ -1023,13 +1017,13 @@
 	"paddw %%mm2, %%mm0            \n\t" //z11 
 	"movq %%mm1, %%mm5             \n\t"
 
-	"movq 5*16(%%edx), %%mm2       \n\t"
+	"movq 5*16(%%"REG_d"), %%mm2       \n\t"
 	"psubw %%mm7, %%mm1            \n\t" //d3
 
 	"paddw %%mm7, %%mm5            \n\t" //d5
 	"psubw %%mm3, %%mm1            \n\t"
 
-	"movq 1*16(%%edx), %%mm7       \n\t"
+	"movq 1*16(%%"REG_d"), %%mm7       \n\t"
 	"psubw %%mm2, %%mm5            \n\t"
 
 	"movq %%mm0, %%mm6             \n\t"
@@ -1039,7 +1033,7 @@
 	"psubw %%mm4, %%mm6            \n\t" //d7  
 
 	// d1 d3 - - - d5 d7 -    
-	"movq 7*16(%%edx), %%mm4       \n\t"
+	"movq 7*16(%%"REG_d"), %%mm4       \n\t"
 	"psubw %%mm7, %%mm0            \n\t"
 
 	"psubw %%mm4, %%mm6            \n\t"
@@ -1066,8 +1060,8 @@
 	"packssdw %%mm4, %%mm4         \n\t"
 	"psubusw %%mm7, %%mm0          \n\t"
 
-	"movd %%mm4, %%eax             \n\t"
-	"orl %%eax, %%eax              \n\t"
+	"movd %%mm4, %%"REG_a"             \n\t"
+	"or %%"REG_a", %%"REG_a"              \n\t"
 	"jnz 2f                 \n\t"
 	//movq [edi+"DCTSIZE_S"*3*2], mm1
 	//movq [edi+"DCTSIZE_S"*5*2], mm5
@@ -1082,7 +1076,7 @@
 	"pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
 	"movq %%mm1, %%mm2             \n\t"
 
-	"movq "DCTSIZE_S"*0*2(%%edi), %%mm5 \n\t"
+	"movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
 	"movq %%mm2, %%mm3             \n\t"
 
 	"pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
@@ -1095,49 +1089,49 @@
 	"pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
 	"psubw %%mm3, %%mm4            \n\t"
 
-	"movq "DCTSIZE_S"*1*2(%%edi), %%mm7 \n\t"
+	"movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
 	"paddw %%mm3, %%mm5            \n\t"
 
-	"movq %%mm4, "DCTSIZE_S"*7*2(%%edi) \n\t"
+	"movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
 	"paddw %%mm6, %%mm7            \n\t"
 
 	"movq "MANGLE(temps)"+2*8, %%mm3       \n\t"
 	"psubw %%mm0, %%mm6            \n\t"
 
-	"movq "DCTSIZE_S"*2*2(%%edi), %%mm4 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
 	"paddw %%mm0, %%mm7            \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*0*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
 	"paddw %%mm3, %%mm4            \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*6*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
 	"psubw %%mm1, %%mm3            \n\t"
 
-	"movq "DCTSIZE_S"*5*2(%%edi), %%mm5 \n\t"
+	"movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
 	"paddw %%mm1, %%mm4            \n\t"
 
-	"movq "DCTSIZE_S"*3*2(%%edi), %%mm6 \n\t"
+	"movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
 	"paddw %%mm3, %%mm5            \n\t"
 
 	"movq "MANGLE(temps)"+3*8, %%mm0       \n\t"
-	"addl $8, %%esi               \n\t"
+	"add $8, %%"REG_S"               \n\t"
 
-	"movq %%mm7, "DCTSIZE_S"*1*2(%%edi) \n\t"
+	"movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
 	"paddw %%mm0, %%mm6            \n\t"
 
-	"movq %%mm4, "DCTSIZE_S"*2*2(%%edi) \n\t"
+	"movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
 	"psubw %%mm2, %%mm0            \n\t"
 
-	"movq "DCTSIZE_S"*4*2(%%edi), %%mm7 \n\t"
+	"movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
 	"paddw %%mm2, %%mm6            \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*5*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
 	"paddw %%mm0, %%mm7            \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*3*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
 
-	"movq %%mm7, "DCTSIZE_S"*4*2(%%edi) \n\t"
-	"addl $8, %%edi               \n\t"
+	"movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
+	"add $8, %%"REG_D"               \n\t"
 	"jmp 4f                  \n\t"
 
 	"2:                    \n\t"
@@ -1179,16 +1173,16 @@
 	//paddw mm7, MM_2
 	"psraw $2, %%mm7              \n\t"
 
-	"paddw "DCTSIZE_S"*0*2(%%edi), %%mm4 \n\t"
+	"paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
 	"psubw %%mm7, %%mm6            \n\t"
 
 	"movq "MANGLE(temps)"+1*8, %%mm3       \n\t"
 	"paddw %%mm7, %%mm4            \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*7*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
 	"paddw %%mm5, %%mm1            \n\t" //'t12
 
-	"movq %%mm4, "DCTSIZE_S"*0*2(%%edi) \n\t"
+	"movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
 	"psubw %%mm7, %%mm1            \n\t" //'t6
 
 	"movq "MANGLE(temps)"+2*8, %%mm7       \n\t"
@@ -1197,65 +1191,65 @@
 	"movq "MANGLE(temps)"+3*8, %%mm6       \n\t"
 	"movq %%mm3, %%mm5             \n\t"
 
-	"paddw "DCTSIZE_S"*1*2(%%edi), %%mm3 \n\t"
+	"paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
 	"psubw %%mm1, %%mm5            \n\t"
 
 	"psubw %%mm1, %%mm2            \n\t" //'t5
 	"paddw %%mm1, %%mm3            \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*6*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
 	"movq %%mm7, %%mm4             \n\t"
 
-	"paddw "DCTSIZE_S"*2*2(%%edi), %%mm7 \n\t"
+	"paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
 	"psubw %%mm2, %%mm4            \n\t"
 
-	"paddw "DCTSIZE_S"*5*2(%%edi), %%mm4 \n\t"
+	"paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
 	"paddw %%mm2, %%mm7            \n\t"
 
-	"movq %%mm3, "DCTSIZE_S"*1*2(%%edi) \n\t"
+	"movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
 	"paddw %%mm2, %%mm0            \n\t" //'t4     
 
 	// 't4 't6 't5 - - - - 't7
-	"movq %%mm7, "DCTSIZE_S"*2*2(%%edi) \n\t"
+	"movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
 	"movq %%mm6, %%mm1             \n\t"
 
-	"paddw "DCTSIZE_S"*4*2(%%edi), %%mm6 \n\t"
+	"paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
 	"psubw %%mm0, %%mm1            \n\t"
 
-	"paddw "DCTSIZE_S"*3*2(%%edi), %%mm1 \n\t"
+	"paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
 	"paddw %%mm0, %%mm6            \n\t"
 
-	"movq %%mm4, "DCTSIZE_S"*5*2(%%edi) \n\t"
-	"addl $8, %%esi               \n\t"
+	"movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
+	"add $8, %%"REG_S"               \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*4*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
 
-	"movq %%mm1, "DCTSIZE_S"*3*2(%%edi) \n\t"
-	"addl $8, %%edi               \n\t"
+	"movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+	"add $8, %%"REG_D"               \n\t"
 
 	"4:                     \n\t"
 //=part 2 (the same)===========================================================    
-	"movq "DCTSIZE_S"*0*2(%%esi), %%mm1 \n\t"
+	"movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
 	//
-	"movq "DCTSIZE_S"*3*2(%%esi), %%mm7 \n\t"
+	"movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
 	"movq %%mm1, %%mm0             \n\t"
 
-	"paddw "DCTSIZE_S"*7*2(%%esi), %%mm1 \n\t" //t0    
+	"paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0    
 	"movq %%mm7, %%mm3             \n\t"
 
-	"paddw "DCTSIZE_S"*4*2(%%esi), %%mm7 \n\t" //t3
+	"paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
 	"movq %%mm1, %%mm5             \n\t"
 
-	"movq "DCTSIZE_S"*1*2(%%esi), %%mm6 \n\t"
+	"movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
 	"psubw %%mm7, %%mm1            \n\t" //t13
 
-	"movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
 	"movq %%mm6, %%mm4             \n\t"
 
-	"paddw "DCTSIZE_S"*6*2(%%esi), %%mm6 \n\t" //t1
+	"paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
 	"paddw %%mm7, %%mm5            \n\t" //t10
 
-	"paddw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t2
+	"paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
 	"movq %%mm6, %%mm7             \n\t"
 
 	"paddw %%mm2, %%mm6            \n\t" //t11    
@@ -1267,21 +1261,21 @@
 	"psubw %%mm6, %%mm2            \n\t" //d4      
 	"paddw %%mm1, %%mm7            \n\t"
 
-	"movq  1*8+4*16(%%edx), %%mm6  \n\t"
+	"movq  1*8+4*16(%%"REG_d"), %%mm6  \n\t"
 	"psllw $2, %%mm7              \n\t"
 
-	"psubw 1*8+0*16(%%edx), %%mm5  \n\t"
+	"psubw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
 	"psubw %%mm6, %%mm2            \n\t"
 
-	"paddusw 1*8+0*16(%%edx), %%mm5 \n\t"
+	"paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
 	"paddusw %%mm6, %%mm2          \n\t"
 
 	"pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
 	//
-	"paddw 1*8+0*16(%%edx), %%mm5  \n\t"
+	"paddw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
 	"paddw %%mm6, %%mm2            \n\t"
 
-	"psubusw 1*8+0*16(%%edx), %%mm5 \n\t"
+	"psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
 	"psubusw %%mm6, %%mm2          \n\t"
 
 //This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
@@ -1296,23 +1290,23 @@
 	"movq %%mm1, %%mm6             \n\t"
 	"paddw %%mm7, %%mm1            \n\t" //d2
 
-	"psubw 1*8+2*16(%%edx), %%mm1  \n\t"
+	"psubw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
 	"psubw %%mm7, %%mm6            \n\t" //d6
 
-	"movq 1*8+6*16(%%edx), %%mm7   \n\t"
+	"movq 1*8+6*16(%%"REG_d"), %%mm7   \n\t"
 	"psraw $2, %%mm5              \n\t"
 
-	"paddusw 1*8+2*16(%%edx), %%mm1 \n\t"
+	"paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
 	"psubw %%mm7, %%mm6            \n\t"
 	// t7 d2 /t11 t4 t6 - d6 /t10     
 
-	"paddw 1*8+2*16(%%edx), %%mm1  \n\t"
+	"paddw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
 	"paddusw %%mm7, %%mm6          \n\t"
 
-	"psubusw 1*8+2*16(%%edx), %%mm1 \n\t"
+	"psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
 	"paddw %%mm7, %%mm6            \n\t"
 
-	"psubw "DCTSIZE_S"*4*2(%%esi), %%mm3 \n\t"
+	"psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
 	"psubusw %%mm7, %%mm6          \n\t"
 
 	//movq [edi+"DCTSIZE_S"*2*2], mm1
@@ -1320,10 +1314,10 @@
 	"movq %%mm1, %%mm7             \n\t"
 	"psraw $2, %%mm2              \n\t"
 
-	"psubw "DCTSIZE_S"*6*2(%%esi), %%mm4 \n\t"
+	"psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
 	"psubw %%mm6, %%mm1            \n\t"
 
-	"psubw "DCTSIZE_S"*7*2(%%esi), %%mm0 \n\t"
+	"psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
 	"paddw %%mm7, %%mm6            \n\t" //'t13
 
 	"psraw $2, %%mm6              \n\t" //paddw mm6, MM_2 !!    ---
@@ -1335,10 +1329,10 @@
 	"movq %%mm2, "MANGLE(temps)"+0*8       \n\t" //!
 	"psubw %%mm6, %%mm7            \n\t" //'t3
 
-	"movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
 	"psubw %%mm6, %%mm1            \n\t" //'t12        
 
-	"psubw "DCTSIZE_S"*5*2(%%esi), %%mm2 \n\t" //t5
+	"psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
 	"movq %%mm5, %%mm6             \n\t"
 
 	"movq %%mm7, "MANGLE(temps)"+3*8       \n\t"
@@ -1371,7 +1365,7 @@
 	"movq %%mm5, "MANGLE(temps)"+1*8       \n\t"
 	"paddw %%mm3, %%mm4            \n\t" //z4
 
-	"movq 1*8+3*16(%%edx), %%mm3   \n\t"
+	"movq 1*8+3*16(%%"REG_d"), %%mm3   \n\t"
 	"movq %%mm0, %%mm1             \n\t"
 
 	"movq %%mm6, "MANGLE(temps)"+2*8       \n\t"
@@ -1381,13 +1375,13 @@
 	"paddw %%mm2, %%mm0            \n\t" //z11 
 	"movq %%mm1, %%mm5             \n\t"
 
-	"movq 1*8+5*16(%%edx), %%mm2   \n\t"
+	"movq 1*8+5*16(%%"REG_d"), %%mm2   \n\t"
 	"psubw %%mm7, %%mm1            \n\t" //d3
 
 	"paddw %%mm7, %%mm5            \n\t" //d5
 	"psubw %%mm3, %%mm1            \n\t"
 
-	"movq 1*8+1*16(%%edx), %%mm7   \n\t"
+	"movq 1*8+1*16(%%"REG_d"), %%mm7   \n\t"
 	"psubw %%mm2, %%mm5            \n\t"
 
 	"movq %%mm0, %%mm6             \n\t"
@@ -1397,7 +1391,7 @@
 	"psubw %%mm4, %%mm6            \n\t" //d7  
 
 	// d1 d3 - - - d5 d7 -    
-	"movq 1*8+7*16(%%edx), %%mm4   \n\t"
+	"movq 1*8+7*16(%%"REG_d"), %%mm4   \n\t"
 	"psubw %%mm7, %%mm0            \n\t"
 
 	"psubw %%mm4, %%mm6            \n\t"
@@ -1424,8 +1418,8 @@
 	"packssdw %%mm4, %%mm4         \n\t"
 	"psubusw %%mm7, %%mm0          \n\t"
 
-	"movd %%mm4, %%eax             \n\t"
-	"orl %%eax, %%eax              \n\t"
+	"movd %%mm4, %%"REG_a"             \n\t"
+	"or %%"REG_a", %%"REG_a"              \n\t"
 	"jnz 3f                 \n\t"
 	//movq [edi+"DCTSIZE_S"*3*2], mm1
 	//movq [edi+"DCTSIZE_S"*5*2], mm5
@@ -1440,7 +1434,7 @@
 	"pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
 	"movq %%mm1, %%mm2             \n\t"
 
-	"movq "DCTSIZE_S"*0*2(%%edi), %%mm5 \n\t"
+	"movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
 	"movq %%mm2, %%mm3             \n\t"
 
 	"pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
@@ -1453,50 +1447,50 @@
 	"pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
 	"psubw %%mm3, %%mm4            \n\t"
 
-	"movq "DCTSIZE_S"*1*2(%%edi), %%mm7 \n\t"
+	"movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
 	"paddw %%mm3, %%mm5            \n\t"
 
-	"movq %%mm4, "DCTSIZE_S"*7*2(%%edi) \n\t"
+	"movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
 	"paddw %%mm6, %%mm7            \n\t"
 
 	"movq "MANGLE(temps)"+2*8, %%mm3       \n\t"
 	"psubw %%mm0, %%mm6            \n\t"
 
-	"movq "DCTSIZE_S"*2*2(%%edi), %%mm4 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
 	"paddw %%mm0, %%mm7            \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*0*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
 	"paddw %%mm3, %%mm4            \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*6*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
 	"psubw %%mm1, %%mm3            \n\t"
 
-	"movq "DCTSIZE_S"*5*2(%%edi), %%mm5 \n\t"
+	"movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
 	"paddw %%mm1, %%mm4            \n\t"
 
-	"movq "DCTSIZE_S"*3*2(%%edi), %%mm6 \n\t"
+	"movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
 	"paddw %%mm3, %%mm5            \n\t"
 
 	"movq "MANGLE(temps)"+3*8, %%mm0       \n\t"
-	"addl $24, %%esi              \n\t"
+	"add $24, %%"REG_S"              \n\t"
 
-	"movq %%mm7, "DCTSIZE_S"*1*2(%%edi) \n\t"
+	"movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
 	"paddw %%mm0, %%mm6            \n\t"
 
-	"movq %%mm4, "DCTSIZE_S"*2*2(%%edi) \n\t"
+	"movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
 	"psubw %%mm2, %%mm0            \n\t"
 
-	"movq "DCTSIZE_S"*4*2(%%edi), %%mm7 \n\t"
+	"movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
 	"paddw %%mm2, %%mm6            \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*5*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
 	"paddw %%mm0, %%mm7            \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*3*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
 
-	"movq %%mm7, "DCTSIZE_S"*4*2(%%edi) \n\t"
-	"addl $24, %%edi              \n\t"
-	"subl $2, %%ecx               \n\t"
+	"movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
+	"add $24, %%"REG_D"              \n\t"
+	"sub $2, %%"REG_c"               \n\t"
 	"jnz 1b                \n\t"
 	"jmp 5f                   \n\t"
 
@@ -1539,16 +1533,16 @@
 	//paddw mm7, MM_2
 	"psraw $2, %%mm7              \n\t"
 
-	"paddw "DCTSIZE_S"*0*2(%%edi), %%mm4 \n\t"
+	"paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
 	"psubw %%mm7, %%mm6            \n\t"
 
 	"movq "MANGLE(temps)"+1*8, %%mm3       \n\t"
 	"paddw %%mm7, %%mm4            \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*7*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
 	"paddw %%mm5, %%mm1            \n\t" //'t12
 
-	"movq %%mm4, "DCTSIZE_S"*0*2(%%edi) \n\t"
+	"movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
 	"psubw %%mm7, %%mm1            \n\t" //'t6
 
 	"movq "MANGLE(temps)"+2*8, %%mm7       \n\t"
@@ -1557,48 +1551,48 @@
 	"movq "MANGLE(temps)"+3*8, %%mm6       \n\t"
 	"movq %%mm3, %%mm5             \n\t"
 
-	"paddw "DCTSIZE_S"*1*2(%%edi), %%mm3 \n\t"
+	"paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
 	"psubw %%mm1, %%mm5            \n\t"
 
 	"psubw %%mm1, %%mm2            \n\t" //'t5
 	"paddw %%mm1, %%mm3            \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*6*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
 	"movq %%mm7, %%mm4             \n\t"
 
-	"paddw "DCTSIZE_S"*2*2(%%edi), %%mm7 \n\t"
+	"paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
 	"psubw %%mm2, %%mm4            \n\t"
 
-	"paddw "DCTSIZE_S"*5*2(%%edi), %%mm4 \n\t"
+	"paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
 	"paddw %%mm2, %%mm7            \n\t"
 
-	"movq %%mm3, "DCTSIZE_S"*1*2(%%edi) \n\t"
+	"movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
 	"paddw %%mm2, %%mm0            \n\t" //'t4     
 
 	// 't4 't6 't5 - - - - 't7
-	"movq %%mm7, "DCTSIZE_S"*2*2(%%edi) \n\t"
+	"movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
 	"movq %%mm6, %%mm1             \n\t"
 
-	"paddw "DCTSIZE_S"*4*2(%%edi), %%mm6 \n\t"
+	"paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
 	"psubw %%mm0, %%mm1            \n\t"
 
-	"paddw "DCTSIZE_S"*3*2(%%edi), %%mm1 \n\t"
+	"paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
 	"paddw %%mm0, %%mm6            \n\t"
 
-	"movq %%mm4, "DCTSIZE_S"*5*2(%%edi) \n\t"
-	"addl $24, %%esi              \n\t"
+	"movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
+	"add $24, %%"REG_S"              \n\t"
 
-	"movq %%mm6, "DCTSIZE_S"*4*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
 
-	"movq %%mm1, "DCTSIZE_S"*3*2(%%edi) \n\t"
-	"addl $24, %%edi              \n\t"
-	"subl $2, %%ecx               \n\t"
+	"movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+	"add $24, %%"REG_D"              \n\t"
+	"sub $2, %%"REG_c"               \n\t"
 	"jnz 1b                \n\t"
 	"5:                      \n\t"
 
 	: "+S"(data), "+D"(output), "+c"(cnt)// input regs
 	: "d"(thr_adr)
-	: "%eax"
+	: "%"REG_a
 	);
 }
 
@@ -1675,18 +1669,18 @@
 			  int16_t* output_adr,  int output_stride,  int cnt)
 {
     asm volatile(
-	"leal (%%eax,%%eax,2), %%edx    \n\t"
+	"lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
 	"1:                     \n\t"
-	"movq "DCTSIZE_S"*0*2(%%esi), %%mm0 \n\t"
+	"movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
 	//
 
-	"movq "DCTSIZE_S"*1*2(%%esi), %%mm1 \n\t"
+	"movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
 	"movq %%mm0, %%mm4             \n\t"
 
-	"movq "DCTSIZE_S"*2*2(%%esi), %%mm2 \n\t"
+	"movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
 	"punpcklwd %%mm1, %%mm0        \n\t"
 
-	"movq "DCTSIZE_S"*3*2(%%esi), %%mm3 \n\t"
+	"movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
 	"punpckhwd %%mm1, %%mm4        \n\t"
 
 	//transpose 4x4
@@ -1714,10 +1708,10 @@
 	"psllw $2, %%mm0              \n\t"
 	"paddw %%mm2, %%mm4            \n\t" //t10
 
-	"movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%esi), %%mm3 \n\t"
+	"movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
 	"psubw %%mm2, %%mm1            \n\t" //t11
 
-	"movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%esi), %%mm2 \n\t"
+	"movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
 	"psubw %%mm5, %%mm0            \n\t"
 
 	"movq %%mm4, %%mm6             \n\t"
@@ -1726,7 +1720,7 @@
 	"psubw %%mm5, %%mm6            \n\t" //t3
 	"movq %%mm1, %%mm7             \n\t"
 
-	"movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%esi), %%mm5 \n\t"
+	"movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
 	"paddw %%mm0, %%mm1            \n\t" //t1
 
 	"movq %%mm4, "MANGLE(temps)"+0*8       \n\t" //t0
@@ -1736,7 +1730,7 @@
 	"punpcklwd %%mm2, %%mm3        \n\t"
 
 	//transpose 4x4    
-	"movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%esi), %%mm6 \n\t"
+	"movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
 	"punpckhwd %%mm2, %%mm4        \n\t"
 
 	"movq %%mm5, %%mm2             \n\t"
@@ -1812,61 +1806,61 @@
 	"paddw %%mm2, %%mm7            \n\t"
 	"psraw $3, %%mm5              \n\t"
 
-	"paddw (%%edi), %%mm5          \n\t"
+	"paddw (%%"REG_D"), %%mm5          \n\t"
 	"psraw $3, %%mm7              \n\t"
 
-	"paddw (%%edi,%%eax,), %%mm1    \n\t"
+	"paddw (%%"REG_D",%%"REG_a",), %%mm1    \n\t"
 	"paddw %%mm2, %%mm0            \n\t"
 
-	"paddw (%%edi,%%eax,2), %%mm7   \n\t"
+	"paddw (%%"REG_D",%%"REG_a",2), %%mm7   \n\t"
 	"paddw %%mm2, %%mm3            \n\t"
 
-	"movq %%mm5, (%%edi)           \n\t"
+	"movq %%mm5, (%%"REG_D")           \n\t"
 	"paddw %%mm2, %%mm6            \n\t"
 
-	"movq %%mm1, (%%edi,%%eax,)     \n\t"
+	"movq %%mm1, (%%"REG_D",%%"REG_a",)     \n\t"
 	"psraw $3, %%mm0              \n\t"
 
-	"movq %%mm7, (%%edi,%%eax,2)    \n\t"
-	"addl %%edx, %%edi             \n\t" //3*ls
+	"movq %%mm7, (%%"REG_D",%%"REG_a",2)    \n\t"
+	"add %%"REG_d", %%"REG_D"             \n\t" //3*ls
 
 	"movq "MANGLE(temps)"+1*8, %%mm5       \n\t" //t3
 	"psraw $3, %%mm3              \n\t"
 
-	"paddw (%%edi,%%eax,2), %%mm0   \n\t"
+	"paddw (%%"REG_D",%%"REG_a",2), %%mm0   \n\t"
 	"psubw %%mm4, %%mm5            \n\t" //d3
 
-	"paddw (%%edi,%%edx,), %%mm3    \n\t"
+	"paddw (%%"REG_D",%%"REG_d",), %%mm3    \n\t"
 	"psraw $3, %%mm6              \n\t"
 
 	"paddw "MANGLE(temps)"+1*8, %%mm4      \n\t" //d4        
 	"paddw %%mm2, %%mm5            \n\t"
 
-	"paddw (%%edi,%%eax,4), %%mm6   \n\t"
+	"paddw (%%"REG_D",%%"REG_a",4), %%mm6   \n\t"
 	"paddw %%mm2, %%mm4            \n\t"
 
-	"movq %%mm0, (%%edi,%%eax,2)    \n\t"
+	"movq %%mm0, (%%"REG_D",%%"REG_a",2)    \n\t"
 	"psraw $3, %%mm5              \n\t"
 
-	"paddw (%%edi), %%mm5          \n\t"
+	"paddw (%%"REG_D"), %%mm5          \n\t"
 	"psraw $3, %%mm4              \n\t"
 
-	"paddw (%%edi,%%eax,), %%mm4    \n\t"
-	"addl $"DCTSIZE_S"*2*4, %%esi      \n\t" //4 rows
+	"paddw (%%"REG_D",%%"REG_a",), %%mm4    \n\t"
+	"add $"DCTSIZE_S"*2*4, %%"REG_S"      \n\t" //4 rows
 
-	"movq %%mm3, (%%edi,%%edx,)     \n\t"
-	"movq %%mm6, (%%edi,%%eax,4)    \n\t"
-	"movq %%mm5, (%%edi)           \n\t"
-	"movq %%mm4, (%%edi,%%eax,)     \n\t"
-
-	"subl %%edx, %%edi             \n\t"
-	"addl $8, %%edi               \n\t"
-	"decl %%ecx                   \n\t"
+	"movq %%mm3, (%%"REG_D",%%"REG_d",)     \n\t"
+	"movq %%mm6, (%%"REG_D",%%"REG_a",4)    \n\t"
+	"movq %%mm5, (%%"REG_D")           \n\t"
+	"movq %%mm4, (%%"REG_D",%%"REG_a",)     \n\t"
+
+	"sub %%"REG_d", %%"REG_D"             \n\t"
+	"add $8, %%"REG_D"               \n\t"
+	"dec %%"REG_c"                   \n\t"
 	"jnz 1b                  \n\t"
 
 	: "+S"(workspace), "+D"(output_adr), "+c"(cnt) //input regs
 	: "a"(output_stride*sizeof(short))
-	: "%edx"
+	: "%"REG_d
 	);
 }
 
@@ -1940,27 +1934,27 @@
 static void row_fdct_mmx(DCTELEM *data,  const uint8_t *pixels,  int line_size,  int cnt)
 {
     asm volatile(
-	"leal (%%eax,%%eax,2), %%edx    \n\t"
+	"lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
 	"6:                     \n\t"
-	"movd (%%esi), %%mm0           \n\t"
+	"movd (%%"REG_S"), %%mm0           \n\t"
 	"pxor %%mm7, %%mm7             \n\t"
 
-	"movd (%%esi,%%eax,), %%mm1     \n\t"
+	"movd (%%"REG_S",%%"REG_a",), %%mm1     \n\t"
 	"punpcklbw %%mm7, %%mm0        \n\t"
 
-	"movd (%%esi,%%eax,2), %%mm2    \n\t"
+	"movd (%%"REG_S",%%"REG_a",2), %%mm2    \n\t"
 	"punpcklbw %%mm7, %%mm1        \n\t"
 
 	"punpcklbw %%mm7, %%mm2        \n\t"
-	"addl %%edx, %%esi             \n\t"
+	"add %%"REG_d", %%"REG_S"             \n\t"
 
 	"movq %%mm0, %%mm5             \n\t"
 	//       
 
-	"movd (%%esi,%%eax,4), %%mm3    \n\t" //7  ;prefetch!
+	"movd (%%"REG_S",%%"REG_a",4), %%mm3    \n\t" //7  ;prefetch!
 	"movq %%mm1, %%mm6             \n\t"
 
-	"movd (%%esi,%%edx,), %%mm4     \n\t" //6
+	"movd (%%"REG_S",%%"REG_d",), %%mm4     \n\t" //6
 	"punpcklbw %%mm7, %%mm3        \n\t"
 
 	"psubw %%mm3, %%mm5            \n\t"
@@ -1969,7 +1963,7 @@
 	"paddw %%mm3, %%mm0            \n\t"
 	"psubw %%mm4, %%mm6            \n\t"
 
-	"movd (%%esi,%%eax,2), %%mm3    \n\t" //5
+	"movd (%%"REG_S",%%"REG_a",2), %%mm3    \n\t" //5
 	"paddw %%mm4, %%mm1            \n\t"
 
 	"movq %%mm5, "MANGLE(temps)"+0*8       \n\t" //t7
@@ -1978,10 +1972,10 @@
 	"movq %%mm6, "MANGLE(temps)"+1*8       \n\t" //t6
 	"movq %%mm2, %%mm4             \n\t"
 
-	"movd (%%esi), %%mm5           \n\t" //3
+	"movd (%%"REG_S"), %%mm5           \n\t" //3
 	"paddw %%mm3, %%mm2            \n\t"
 
-	"movd (%%esi,%%eax,), %%mm6     \n\t" //4
+	"movd (%%"REG_S",%%"REG_a",), %%mm6     \n\t" //4
 	"punpcklbw %%mm7, %%mm5        \n\t"
 
 	"psubw %%mm3, %%mm4            \n\t"
@@ -2033,16 +2027,16 @@
 	"punpckhdq %%mm7, %%mm5        \n\t" //1
 	"movq %%mm6, %%mm7             \n\t"
 
-	"movq %%mm0, "DCTSIZE_S"*0*2(%%edi) \n\t"
+	"movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
 	"punpckldq %%mm2, %%mm6        \n\t" //2     
 
-	"movq %%mm5, "DCTSIZE_S"*1*2(%%edi) \n\t"
+	"movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
 	"punpckhdq %%mm2, %%mm7        \n\t" //3    
 
-	"movq %%mm6, "DCTSIZE_S"*2*2(%%edi) \n\t"
+	"movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
 	"paddw %%mm1, %%mm4            \n\t"
 
-	"movq %%mm7, "DCTSIZE_S"*3*2(%%edi) \n\t"
+	"movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
 	"psllw $2, %%mm3              \n\t" //t10    
 
 	"movq "MANGLE(temps)"+0*8, %%mm2       \n\t"
@@ -2089,28 +2083,28 @@
 	"movq %%mm2, %%mm7             \n\t"
 
 	"punpckldq %%mm5, %%mm2        \n\t" //4
-	"subl %%edx, %%esi             \n\t"
+	"sub %%"REG_d", %%"REG_S"             \n\t"
 
 	"punpckhdq %%mm5, %%mm7        \n\t" //5
 	"movq %%mm4, %%mm5             \n\t"
 
-	"movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%edi) \n\t"
+	"movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
 	"punpckldq %%mm6, %%mm4        \n\t" //6
 
-	"movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%edi) \n\t"
+	"movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
 	"punpckhdq %%mm6, %%mm5        \n\t" //7    
 
-	"movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%edi) \n\t"
-	"addl $4, %%esi               \n\t"
+	"movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
+	"add $4, %%"REG_S"               \n\t"
 
-	"movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%edi) \n\t"
-	"addl $"DCTSIZE_S"*2*4, %%edi      \n\t" //4 rows    
-	"decl %%ecx                   \n\t"
+	"movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
+	"add $"DCTSIZE_S"*2*4, %%"REG_D"      \n\t" //4 rows    
+	"dec %%"REG_c"                   \n\t"
 	"jnz 6b                  \n\t"
 
 	: "+S"(pixels), "+D"(data), "+c"(cnt) //input regs
-	: "a"(line_size)
-	: "%edx");
+	: "a"((long)line_size) //widened: the asm indexes with the full REG_a, so the int stride must be sign-extended on x86-64
+	: "%"REG_d);
 }
 
 #endif // HAVE_MMX


More information about the MPlayer-dev-eng mailing list