[MPlayer-dev-eng] [PATCH] Make mp3lib SIMD optimizations work on AMD64, the Finale
Zuxy Meng
zuxy.meng at gmail.com
Wed Jun 6 08:33:28 CEST 2007
Hi,
2007/5/20, Zuxy Meng <zuxy.meng at gmail.com>:
> As discussed with Guillaume on IRC, I'll split my previous big patch
> (Rewrite synth_1to1_MMX....) into several small parts for easier
> review. Here's the first one, rewriting the generic code in
> synth_1to1_MMX from assembly to C, so we don't need to deal with
> different ABIs. I've tested it and confirmed it doesn't hurt
> performance.
>
> Note I removed a conditional jump in the remaining assembly too. By
> analyzing the code I'm sure it's never taken so don't worry about
> that. Strictly speaking it should be in a separate patch but then this
> patch would break mplayer...
>
> Part 2 will replace 32-bit leal to equivalent add/sub (without the 'l'
> suffix) so pointer arithmetic will be 64-bit under amd64.
>
> Part 3 will remove hardcoded registers.
>
> Part 4 will kill tabinit_mmx.c. We don't need to compute the table at
> runtime; it can be predetermined.
>
> Part 5 will correct data types, replacing 'long' with 'int' where necessary.
>
> The last patch will deal with Makefile and macros.
The attached patch modifies macros and the Makefile, effectively
turning everything on for AMD64. The result is 47% faster decoding on
a K8.
Please note that because mp3lib uses floating-point arithmetic, the output
won't be bit-exact with and without the patch. But this shouldn't hurt the
sound quality.
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/sr1.c
===================================================================
--- mp3lib/sr1.c (revision 23483)
+++ mp3lib/sr1.c (working copy)
@@ -32,8 +32,9 @@
#include "libvo/fastmemcpy.h"
-#ifdef ARCH_X86_32
-#define CAN_COMPILE_X86_ASM
+#ifdef ARCH_X86_64
+#undef HAVE_3DNOW
+#undef HAVE_3DNOWEX
#endif
//static FILE* mp3_file=NULL;
@@ -137,7 +138,7 @@
// if(MP3_frames>=7741) printf("getbits_fast: bits=%d bitsleft=%d wordptr=%x\n",number_of_bits,bitsleft,wordpointer);
if((bitsleft-=number_of_bits)<0) return 0;
if(!number_of_bits) return 0;
-#if defined(CAN_COMPILE_X86_ASM)
+#ifdef ARCH_X86
rval = bswap_16(*((uint16_t *)wordpointer));
#else
/*
@@ -180,7 +181,7 @@
LOCAL int stream_head_read(unsigned char *hbuf,uint32_t *newhead){
if(mp3_read(hbuf,4) != 4) return FALSE;
-#if defined(CAN_COMPILE_X86_ASM)
+#ifdef ARCH_X86
*newhead = bswap_32(*((uint32_t*)hbuf));
#else
/*
@@ -415,8 +416,6 @@
make_decode_tables(outscale);
-#ifdef CAN_COMPILE_X86_ASM
-
#ifdef HAVE_MMX
if (gCpuCaps.hasMMX)
{
@@ -459,6 +458,7 @@
}
else
#endif
+#ifdef ARCH_X86_32
if (gCpuCaps.cpuType >= CPUTYPE_I586)
{
synth_func = synth_1to1_pent;
Index: mp3lib/decod386.c
===================================================================
--- mp3lib/decod386.c (revision 23483)
+++ mp3lib/decod386.c (working copy)
@@ -102,7 +102,7 @@
static synth_func_t synth_func;
-#if defined(CAN_COMPILE_X86_ASM) && defined(HAVE_MMX)
+#ifdef HAVE_MMX
extern int synth_1to1_MMX( real *bandPtr,int channel,short * samples);
#endif
@@ -125,7 +125,7 @@
*pnt += 128;
/* optimized for x86 */
-#if defined(CAN_COMPILE_X86_ASM)
+#ifdef ARCH_X86
if ( synth_func )
{
// printf("Calling %p, bandPtr=%p channel=%d samples=%p\n",synth_func,bandPtr,channel,samples);
Index: mp3lib/Makefile
===================================================================
--- mp3lib/Makefile (revision 23483)
+++ mp3lib/Makefile (working copy)
@@ -3,18 +3,23 @@
LIBNAME_COMMON = libmp3.a
SRCS_COMMON = sr1.c
+ifeq ($(TARGET_ARCH_X86),yes)
+SRCS_COMMON-$(TARGET_MMX) += decode_MMX.c
+SRCS_COMMON-$(TARGET_SSE) += dct64_sse.c
ifeq ($(TARGET_ARCH_X86_32),yes)
SRCS_COMMON += decode_i586.c
-SRCS_COMMON-$(TARGET_MMX) += decode_MMX.c dct64_MMX.c
+SRCS_COMMON-$(TARGET_MMX) += dct64_MMX.c
SRCS_COMMON-$(TARGET_3DNOW) += dct36_3dnow.c dct64_3dnow.c
SRCS_COMMON-$(TARGET_3DNOWEX) += dct36_k7.c dct64_k7.c
-SRCS_COMMON-$(TARGET_SSE) += dct64_sse.c
endif
+endif
SRCS_COMMON-$(TARGET_ALTIVEC) += dct64_altivec.c
include ../mpcommon.mak
+ifeq ($(TARGET_ARCH_X86_32),yes)
decode_i586.o: CFLAGS += -fomit-frame-pointer
+endif
%: %.c $(LIBNAME_COMMON) ../libvo/aclib.o ../mp_msg-mencoder.o ../cpudetect.o ../osdep/getch2.o
$(CC) $(CFLAGS) -o $@ $^ -ltermcap -lm
More information about the MPlayer-dev-eng
mailing list