[MPlayer-dev-eng] [PATCH] fixed point faad, gnu assembler etc.

Wed Apr 19 22:28:35 CEST 2006

Hello,
I think this patch will make the biggest speed difference for faad,
though it will reduce the quality.
I also think it might break some files (perhaps SBR is not supported in
fixed-point mode?).
So please test extensively.

Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libfaad2/Makefile
===================================================================
RCS file: /cvsroot/mplayer/main/libfaad2/Makefile,v
retrieving revision 1.10
diff -u -r1.10 Makefile

--- libfaad2/Makefile	18 Apr 2006 19:39:29 -0000	1.10
+++ libfaad2/Makefile	19 Apr 2006 09:46:21 -0000
@@ -48,7 +48,7 @@
 
 # Uncomment this to use the FIXED_POINT implementation of FAAD2.
 # This should improve performance, especially for SBR files.
-#CFLAGS  = -I. $(OPTFLAGS) -DFIXED_POINT
+CFLAGS  = -I. $(OPTFLAGS) -DFIXED_POINT
 
 .SUFFIXES: .c .o
 
Index: libfaad2/fixed.h
===================================================================
RCS file: /cvsroot/mplayer/main/libfaad2/fixed.h,v
retrieving revision 1.7
diff -u -r1.7 fixed.h
--- libfaad2/fixed.h	18 Apr 2006 19:39:30 -0000	1.7
+++ libfaad2/fixed.h	19 Apr 2006 09:46:29 -0000
@@ -226,12 +226,62 @@
     *y2 = yt2 << (FRAC_SIZE-FRAC_BITS);
 }
 
+#elif defined(__GNUC__) && (defined (ARCH_X86) || defined(ARCH_X86_64))
+#define MUL_S(A,B,S) /* A = low 32 bits of ((A*B) >> S), no rounding; A must be an lvalue */ \
+  __asm__ ( /* not volatile: pure computation, so GCC may merge or drop unused results */ \
+    "imul %1                 \n\t" /* edx:eax = eax * B (signed 64-bit product) */ \
+    "shrd %%cl, %%edx, %%eax \n\t" /* eax = edx:eax shifted right by cl, low 32 bits kept */ \
+    : "+a" (A) : "r" (B), "c" (S) : "%edx", "cc")
+
+  static INLINE real_t MUL_R(real_t A, real_t B) { /* returns (A*B) >> REAL_BITS, truncated */
+    MUL_S(A, B, REAL_BITS); /* inline asm overwrites the local copy of A with the result */
+    return A;
+  }
+
+  static INLINE real_t MUL_C(real_t A, real_t B) { /* returns (A*B) >> COEF_BITS, truncated */
+    MUL_S(A, B, COEF_BITS); /* inline asm overwrites the local copy of A with the result */
+    return A;
+  }
+
+  static INLINE real_t MUL_F(real_t A, real_t B) { /* returns (A*B) >> FRAC_BITS, truncated */
+    MUL_S(A, B, FRAC_BITS); /* inline asm overwrites the local copy of A with the result */
+    return A;
+  }
+
+  static INLINE real_t MUL_Q2(real_t A, real_t B) { /* returns (A*B) >> Q2_BITS, truncated */
+    MUL_S(A, B, Q2_BITS); /* inline asm overwrites the local copy of A with the result */
+    return A;
+  }
+
+  static INLINE real_t MUL_SHIFT6(real_t A, real_t B) { /* returns (A*B) >> 6, truncated */
+    MUL_S(A, B, 6); /* inline asm overwrites the local copy of A with the result */
+    return A;
+  }
+
+  static INLINE real_t MUL_SHIFT23(real_t A, real_t B) { /* returns (A*B) >> 23, truncated */
+    MUL_S(A, B, 23); /* inline asm overwrites the local copy of A with the result */
+    return A;
+  }
+
+  static INLINE real_t _MulHigh(real_t A, real_t B) { /* upper 32 bits of the signed product A*B */
+    __asm__ (                          /* not volatile: pure computation, no side effects */
+      "imul %1                 \n\t"   /* edx:eax = eax * B (signed 64-bit product) */
+      "mov %%edx, %%eax        \n\t"   /* hand back the high half through eax */
+      : "+a" (A) : "r" (B) : "%edx", "cc");
+    return A;
+  }
+
+  static INLINE void ComplexMult(real_t *y1, real_t *y2, real_t x1, real_t x2,
+                                 real_t c1, real_t c2) { /* NOTE(review): << on a negative sum is undefined in ISO C; confirm GCC behaviour is relied on */
+    *y1 = (_MulHigh(x1, c1) + _MulHigh(x2, c2))<<(FRAC_SIZE-FRAC_BITS); /* y1 = x1*c1 + x2*c2, rescaled */
+    *y2 = (_MulHigh(x2, c1) - _MulHigh(x1, c2))<<(FRAC_SIZE-FRAC_BITS); /* y2 = x2*c1 - x1*c2, rescaled */
+  }
 #else
 
   /* multiply with real shift */
-  #define MUL_R(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (REAL_BITS-1))) >> REAL_BITS)
+  #define MUL_R(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> REAL_BITS) /* truncating: no rounding offset is added before the shift */
   /* multiply with coef shift */
-  #define MUL_C(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (COEF_BITS-1))) >> COEF_BITS)
+  #define MUL_C(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> COEF_BITS) /* truncating: no rounding offset is added before the shift */
   /* multiply with fractional shift */
 #if defined(_WIN32_WCE) && defined(_ARM_)
   /* eVC for PocketPC has an intrinsic function that returns only the high 32 bits of a 32x32 bit multiply */
@@ -240,12 +290,12 @@
       return _MulHigh(A,B) << (32-FRAC_BITS);
   }
 #else
-  #define _MulHigh(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (FRAC_SIZE-1))) >> FRAC_SIZE)
-  #define MUL_F(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (FRAC_BITS-1))) >> FRAC_BITS)
+  #define _MulHigh(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> FRAC_SIZE) /* 64-bit product shifted right by FRAC_SIZE, no rounding offset */
+  #define MUL_F(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> FRAC_BITS) /* truncating: no rounding offset is added before the shift */
 #endif
-  #define MUL_Q2(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (Q2_BITS-1))) >> Q2_BITS)
-  #define MUL_SHIFT6(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (6-1))) >> 6)
-  #define MUL_SHIFT23(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)+(1 << (23-1))) >> 23)
+  #define MUL_Q2(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> Q2_BITS) /* truncating: no rounding offset is added before the shift */
+  #define MUL_SHIFT6(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> 6) /* 64-bit product shifted right by 6, no rounding offset */
+  #define MUL_SHIFT23(A,B) (real_t)(((int64_t)(A)*(int64_t)(B)) >> 23) /* 64-bit product shifted right by 23, no rounding offset */
 
 /* Complex multiplication */
 static INLINE void ComplexMult(real_t *y1, real_t *y2,