[MPlayer-dev-eng] [PATCH] Make mp3lib SIMD optimizations work on AMD64, the Finale

Zuxy Meng zuxy.meng at gmail.com
Wed Jun 6 08:33:28 CEST 2007


Hi,

2007/5/20, Zuxy Meng <zuxy.meng at gmail.com>:
> As discussed with Guillaume on IRC, I'll split my previous big patch
> (Rewrite synth_1to1_MMX....) into several small parts for easier
> review. Here's the first one, rewriting the generic code in
> synth_1to1_MMX from assembly to C, so we don't need to deal with
> different ABIs. I've tested it and confirmed it doesn't hurt
> performance.
>
> Note I removed a conditional jump in the remaining assembly too. By
> analyzing the code I'm sure it's never taken so don't worry about
> that. Strictly speaking it should be in a separate patch but then this
> patch would break mplayer...
>
> Part 2 will replace 32-bit leal with equivalent add/sub instructions (without
> the 'l' suffix) so pointer arithmetic will be 64-bit under amd64.
>
> Part 3 will remove hardcoded registers.
>
> Part 4 will kill tabinit_mmx.c. We don't need to compute the table at
> runtime; it can be predetermined.
>
> Part 5 will correct data types, replacing 'long' with 'int' where necessary.
>
> The last patch will deal with Makefile and macros.

The attached patch modifies macros and the Makefile, effectively
turning everything on for AMD64. The result is 47% faster decoding on
a K8.

Please note that because mp3lib uses floating-point arithmetic, the
output won't be bit-exact with and without the patch. But this
shouldn't hurt the sound quality.
-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/sr1.c
===================================================================
--- mp3lib/sr1.c	(revision 23483)
+++ mp3lib/sr1.c	(working copy)
@@ -32,8 +32,9 @@
 
 #include "libvo/fastmemcpy.h"
 
-#ifdef ARCH_X86_32
-#define CAN_COMPILE_X86_ASM
+#ifdef ARCH_X86_64
+#undef HAVE_3DNOW
+#undef HAVE_3DNOWEX
 #endif
 
 //static FILE* mp3_file=NULL;
@@ -137,7 +138,7 @@
 //  if(MP3_frames>=7741) printf("getbits_fast: bits=%d  bitsleft=%d  wordptr=%x\n",number_of_bits,bitsleft,wordpointer);
   if((bitsleft-=number_of_bits)<0) return 0;
   if(!number_of_bits) return 0;
-#if defined(CAN_COMPILE_X86_ASM)
+#ifdef ARCH_X86
   rval = bswap_16(*((uint16_t *)wordpointer));
 #else
   /*
@@ -180,7 +181,7 @@
 
 LOCAL int stream_head_read(unsigned char *hbuf,uint32_t *newhead){
   if(mp3_read(hbuf,4) != 4) return FALSE;
-#if defined(CAN_COMPILE_X86_ASM)
+#ifdef ARCH_X86
   *newhead = bswap_32(*((uint32_t*)hbuf));
 #else
   /*
@@ -415,8 +416,6 @@
 
     make_decode_tables(outscale);
 
-#ifdef CAN_COMPILE_X86_ASM
-
 #ifdef HAVE_MMX
     if (gCpuCaps.hasMMX)
     {
@@ -459,6 +458,7 @@
     }
     else
 #endif
+#ifdef ARCH_X86_32
     if (gCpuCaps.cpuType >= CPUTYPE_I586)
     {
 	synth_func = synth_1to1_pent;
Index: mp3lib/decod386.c
===================================================================
--- mp3lib/decod386.c	(revision 23483)
+++ mp3lib/decod386.c	(working copy)
@@ -102,7 +102,7 @@
 
 static synth_func_t synth_func;
 
-#if defined(CAN_COMPILE_X86_ASM) && defined(HAVE_MMX)
+#ifdef HAVE_MMX
 extern int synth_1to1_MMX( real *bandPtr,int channel,short * samples);
 #endif
 
@@ -125,7 +125,7 @@
   *pnt += 128;
 
 /* optimized for x86 */
-#if defined(CAN_COMPILE_X86_ASM)
+#ifdef ARCH_X86
   if ( synth_func )
    {
 //    printf("Calling %p, bandPtr=%p channel=%d samples=%p\n",synth_func,bandPtr,channel,samples);
Index: mp3lib/Makefile
===================================================================
--- mp3lib/Makefile	(revision 23483)
+++ mp3lib/Makefile	(working copy)
@@ -3,18 +3,23 @@
 LIBNAME_COMMON = libmp3.a
 
 SRCS_COMMON = sr1.c
+ifeq ($(TARGET_ARCH_X86),yes)
+SRCS_COMMON-$(TARGET_MMX)     += decode_MMX.c
+SRCS_COMMON-$(TARGET_SSE)     += dct64_sse.c
 ifeq ($(TARGET_ARCH_X86_32),yes)
 SRCS_COMMON                   += decode_i586.c
-SRCS_COMMON-$(TARGET_MMX)     += decode_MMX.c dct64_MMX.c
+SRCS_COMMON-$(TARGET_MMX)     += dct64_MMX.c
 SRCS_COMMON-$(TARGET_3DNOW)   += dct36_3dnow.c dct64_3dnow.c
 SRCS_COMMON-$(TARGET_3DNOWEX) += dct36_k7.c dct64_k7.c
-SRCS_COMMON-$(TARGET_SSE)     += dct64_sse.c
 endif
+endif
 SRCS_COMMON-$(TARGET_ALTIVEC) += dct64_altivec.c
 
 include ../mpcommon.mak
 
+ifeq ($(TARGET_ARCH_X86_32),yes)
 decode_i586.o: CFLAGS += -fomit-frame-pointer
+endif
 
 %: %.c $(LIBNAME_COMMON) ../libvo/aclib.o ../mp_msg-mencoder.o ../cpudetect.o ../osdep/getch2.o
 	$(CC) $(CFLAGS) -o $@ $^ -ltermcap -lm


More information about the MPlayer-dev-eng mailing list