[FFmpeg-devel] r9017 breaks WMA decoding on Intel Macs

Trent Piepho xyzzy
Mon May 28 01:26:19 CEST 2007


On Mon, 28 May 2007, Guillaume POIRIER wrote:
>On 5/28/07, Trent Piepho <xyzzy at speakeasy.org> wrote:
>> On Sun, 27 May 2007, Guillaume POIRIER wrote:
>> >On 5/27/07, Zuxy Meng <zuxy.meng at gmail.com> wrote:
>> >> 2007/5/27, Guillaume Poirier <gpoirier at mplayerhq.hu>:
>> >
>> >> Then please check things like "8+%0" "-16+%1", replace constraints
>> >> from "m" to "r" and rewrite using "8(%0)" "-16(%1)". Maybe Apple's
>> >> binutils doesn't like such syntax.
>> >
>> >I guess I suck at doing such things too. Attached patch is an attempt
>> >to do what you suggest, but it doesn't assemble, either on Linux x86-64
>> >or OSX x86
>>
>> When you change from "m" to "r" you're also changing from the lvalue itself
>> to a pointer to the lvalue.
>>
>> This results in less efficient code, since you preclude using SIB
>> addressing and might need an extra register.  If the apple version of gas
>> doesn't like the syntax, it should generate an error.
>
>
>The said syntax seems to be supported, since there's no error
>reported. Trent, you mean that it shouldn't be necessary to replace
>"8+%0" "-16+%1", to "8(%0)" "-16(%1)" ?
>
>When I do this change, and this change alone (no "m" to "r" constraint),
>it doesn't assemble either.

In general you can write:
 int x, foo[16];
 asm("mov 4+%1, %0" : "=r"(x) : "m"(foo[4]));

Or you can write this:
 int x, foo[16];
 asm("mov 4(%1), %0" : "=r"(x) : "r"(foo + 4));
 asm("mov 4(%1), %0" : "=r"(x) : "r"(&foo[4]));  // same as previous line

The first form, with "4+%1", is better.  It can result in more efficient
code, and in some cases uses one less register, which can be very important.

Both ways of writing this are slightly broken though.  If you also touch
foo from some C code, the optimizer could produce incorrect code.  This
is better:
 int x, foo[16];
 asm("mov %2, %0" : "=r"(x) : "m"(foo[4]), "m"(foo[4+1]));

I'm attaching a patch showing the right way to do it.

But I don't think this is the problem.  I think it's more likely that the
apple compiler is compiling some C code into something that uses SSE, and
this is breaking the asm blocks which assume that nothing else uses SSE
registers.
-------------- next part --------------
Index: fft_sse.c
===================================================================
--- fft_sse.c	(revision 9144)
+++ fft_sse.c	(working copy)
@@ -158,7 +158,7 @@
     asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
 #define P1M1P1M1 "%%xmm8"
 #else
-#define P1M1P1M1 "%4"
+#define P1M1P1M1 "%8"
 #endif
 
     /* pre rotation */
@@ -170,12 +170,12 @@
         asm volatile (
             "movaps          %0, %%xmm0 \n\t"   // xmm0 = r0 X  r1 X : in2
             "movaps          %1, %%xmm3 \n\t"   // xmm3 = X  i1 X  i0: in1
-            "movaps      -16+%0, %%xmm4 \n\t"   // xmm4 = r0 X  r1 X : in2
-            "movaps       16+%1, %%xmm7 \n\t"   // xmm7 = X  i1 X  i0: in1
+            "movaps          %4, %%xmm4 \n\t"   // xmm4 = r0 X  r1 X : in2
+            "movaps          %5, %%xmm7 \n\t"   // xmm7 = X  i1 X  i0: in1
             "movlps          %2, %%xmm1 \n\t"   // xmm1 = X  X  R1 R0: tcos
             "movlps          %3, %%xmm2 \n\t"   // xmm2 = X  X  I1 I0: tsin
-            "movlps        8+%2, %%xmm5 \n\t"   // xmm5 = X  X  R1 R0: tcos
-            "movlps        8+%3, %%xmm6 \n\t"   // xmm6 = X  X  I1 I0: tsin
+            "movlps          %6, %%xmm5 \n\t"   // xmm5 = X  X  R1 R0: tcos
+            "movlps          %7, %%xmm6 \n\t"   // xmm6 = X  X  I1 I0: tsin
             "shufps $95, %%xmm0, %%xmm0 \n\t"   // xmm0 = r1 r1 r0 r0
             "shufps $160,%%xmm3, %%xmm3 \n\t"   // xmm3 = i1 i1 i0 i0
             "shufps $95, %%xmm4, %%xmm4 \n\t"   // xmm4 = r1 r1 r0 r0
@@ -195,13 +195,15 @@
             "addps       %%xmm3, %%xmm0 \n\t"   // xmm0 = result
             "addps       %%xmm7, %%xmm4 \n\t"   // xmm4 = result
             ::"m"(in2[-2*k]), "m"(in1[2*k]),
-              "m"(tcos[k]), "m"(tsin[k])
+              "m"(tcos[k]), "m"(tsin[k]),
+              "m"(in2[-2*k - 4]), "m"(in1[2*k + 4]),
+              "m"(tcos[k+2]), "m"(tsin[k+2])
 #ifndef ARCH_X86_64
               ,"m"(*p1m1p1m1)
 #endif
         );
         /* Should be in the same block, hack for gcc2.95 & gcc3 */
-        asm (
+        asm volatile (
             "movlps      %%xmm0, %0     \n\t"
             "movhps      %%xmm0, %1     \n\t"
             "movlps      %%xmm4, %2     \n\t"
@@ -215,20 +217,20 @@
 
 #ifndef ARCH_X86_64
 #undef P1M1P1M1
-#define P1M1P1M1 "%3"
+#define P1M1P1M1 "%6"
 #endif
 
     /* post rotation + reordering */
     for (k = 0; k < n4; k += 4) {
         asm (
             "movaps          %0, %%xmm0 \n\t"   // xmm0 = i1 r1 i0 r0: z
-            "movaps       16+%0, %%xmm4 \n\t"   // xmm4 = i1 r1 i0 r0: z
-            "movlps          %1, %%xmm1 \n\t"   // xmm1 = X  X  R1 R0: tcos
-            "movlps        8+%1, %%xmm5 \n\t"   // xmm5 = X  X  R1 R0: tcos
+            "movaps          %1, %%xmm4 \n\t"   // xmm4 = i1 r1 i0 r0: z
+            "movlps          %2, %%xmm1 \n\t"   // xmm1 = X  X  R1 R0: tcos
+            "movlps          %4, %%xmm5 \n\t"   // xmm5 = X  X  R1 R0: tcos
             "movaps      %%xmm0, %%xmm3 \n\t"   // xmm3 = i1 r1 i0 r0
             "movaps      %%xmm4, %%xmm7 \n\t"   // xmm7 = i1 r1 i0 r0
-            "movlps          %2, %%xmm2 \n\t"   // xmm2 = X  X  I1 I0: tsin
-            "movlps        8+%2, %%xmm6 \n\t"   // xmm6 = X  X  I1 I0: tsin
+            "movlps          %3, %%xmm2 \n\t"   // xmm2 = X  X  I1 I0: tsin
+            "movlps          %5, %%xmm6 \n\t"   // xmm6 = X  X  I1 I0: tsin
             "shufps $160,%%xmm0, %%xmm0 \n\t"   // xmm0 = r1 r1 r0 r0
             "shufps $245,%%xmm3, %%xmm3 \n\t"   // xmm3 = i1 i1 i0 i0
             "shufps $160,%%xmm4, %%xmm4 \n\t"   // xmm4 = r1 r1 r0 r0
@@ -248,9 +250,10 @@
             "addps       %%xmm3, %%xmm0 \n\t"   // xmm0 = result
             "addps       %%xmm7, %%xmm4 \n\t"   // xmm4 = result
             "movaps      %%xmm0, %0     \n\t"
-            "movaps      %%xmm4, 16+%0  \n\t"
-            :"+m"(z[k])
-            :"m"(tcos[k]), "m"(tsin[k])
+            "movaps      %%xmm4, %1     \n\t"
+            :"+m"(z[k]), "+m"(z[k+2])
+            :"m"(tcos[k]), "m"(tsin[k]),
+             "m"(tcos[k+2]), "m"(tsin[k+2])
 #ifndef ARCH_X86_64
              ,"m"(*p1m1p1m1)
 #endif



More information about the ffmpeg-devel mailing list