[MPlayer-dev-eng] [PATCH] Rewrite synth_1to1_MMX, paving the way toward x86-64
Zuxy Meng
zuxy.meng at gmail.com
Thu May 17 14:48:48 CEST 2007
Hi,
The original version of synth_1to1_MMX hardcoded 32-bit pointers and
the x86 ABI, making it difficult to port to x86-64. I rewrote the general
code in C and removed hardcoded 32-bit registers in the remaining MMX
code (therefore obsoleting my previous patch to align the stack
manually). The result is not only easier to port but also faster:
(Tested with rdtsc on a Dothan)
before: 1019 cycles
after: 976 cycles
Of course the decoded result is identical.
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/decode_MMX.c
===================================================================
--- mp3lib/decode_MMX.c (revision 23328)
+++ mp3lib/decode_MMX.c (working copy)
@@ -11,6 +11,8 @@
#include "mangle.h"
#define real float /* ugly - but only way */
+extern short mp3lib_decwins[1088];
+extern void (*dct64_MMX_func)(short*, short*, real*);
static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL;
static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL;
unsigned long __attribute__((aligned(16))) costab_mmx[] =
@@ -48,67 +50,56 @@
1060439283,
};
-static int temp; // buggy gcc 3.x fails if this is moved into the function :(
-void synth_1to1_MMX_s(real *bandPtr, int channel, short *samples,
- short *buffs, int *bo)
+int synth_1to1_MMX(real *bandPtr, int channel, short *samples)
{
+ static short buffs[2][2][0x110] __attribute__((aligned(8)));
+ static int bo = 1;
+ short *b0, (*buf)[0x110], *a, *b;
+ short* window;
+ int bo1, i = 8;
+ if (channel == 0) {
+ bo = (bo - 1) & 0xf;
+ buf = buffs[1];
+ } else {
+ samples++;
+ buf = buffs[0];
+ }
+
+ if (bo & 1) {
+ b0 = buf[1];
+ bo1 = bo + 1;
+ a = buf[0] + bo;
+ b = buf[1] + ((bo + 1) & 0xf);
+ } else {
+ b0 = buf[0];
+ bo1 = bo;
+ b = buf[0] + bo;
+ a = buf[1] + ((bo + 1) & 0xf);
+ }
+
+ dct64_MMX_func(a, b, bandPtr);
+ window = mp3lib_decwins + 16 - bo1;
+ //printf("DEBUG: channel %d, bo %d, off %d\n", channel, bo, 16 - bo1);
__asm __volatile(
- "movl %1,%%ecx\n\t"
- "movl %2,%%edi\n\t"
- "movl $15,%%ebx\n\t"
- "movl %4,%%edx\n\t"
- "leal (%%edi,%%ecx,2),%%edi\n\t"
- "decl %%ecx\n\t"
- "movl %3,%%esi\n\t"
- "movl (%%edx),%%eax\n\t"
- "jecxz .L01\n\t"
- "decl %%eax\n\t"
- "andl %%ebx,%%eax\n\t"
- "leal 1088(%%esi),%%esi\n\t"
- "movl %%eax,(%%edx)\n\t"
-".L01:\n\t"
- "leal (%%esi,%%eax,2),%%edx\n\t"
- "movl %%eax,%5\n\t"
- "incl %%eax\n\t"
- "andl %%ebx,%%eax\n\t"
- "leal 544(%%esi,%%eax,2),%%ecx\n\t"
- "incl %%ebx\n\t"
- "testl $1, %%eax\n\t"
- "jnz .L02\n\t"
- "xchgl %%edx,%%ecx\n\t"
- "incl %5\n\t"
- "leal 544(%%esi),%%esi\n\t"
-".L02:\n\t"
- "emms\n\t"
- "pushl %0\n\t"
- "pushl %%edx\n\t"
- "pushl %%ecx\n\t"
- "call *"MANGLE(dct64_MMX_func)"\n\t"
- "addl $12, %%esp\n\t"
- "leal 1(%%ebx), %%ecx\n\t"
- "subl %5,%%ebx\n\t"
- "pushl %%ecx\n\t"
- "leal "MANGLE(mp3lib_decwins)"(%%ebx,%%ebx,1), %%edx\n\t"
- "shrl $1, %%ecx\n\t"
ASMALIGN(4)
".L03:\n\t"
- "movq (%%edx),%%mm0\n\t"
- "movq 64(%%edx),%%mm4\n\t"
- "pmaddwd (%%esi),%%mm0\n\t"
- "pmaddwd 32(%%esi),%%mm4\n\t"
- "movq 8(%%edx),%%mm1\n\t"
- "movq 72(%%edx),%%mm5\n\t"
- "pmaddwd 8(%%esi),%%mm1\n\t"
- "pmaddwd 40(%%esi),%%mm5\n\t"
- "movq 16(%%edx),%%mm2\n\t"
- "movq 80(%%edx),%%mm6\n\t"
- "pmaddwd 16(%%esi),%%mm2\n\t"
- "pmaddwd 48(%%esi),%%mm6\n\t"
- "movq 24(%%edx),%%mm3\n\t"
- "movq 88(%%edx),%%mm7\n\t"
- "pmaddwd 24(%%esi),%%mm3\n\t"
- "pmaddwd 56(%%esi),%%mm7\n\t"
+ "movq (%1),%%mm0\n\t"
+ "movq 64(%1),%%mm4\n\t"
+ "pmaddwd (%2),%%mm0\n\t"
+ "pmaddwd 32(%2),%%mm4\n\t"
+ "movq 8(%1),%%mm1\n\t"
+ "movq 72(%1),%%mm5\n\t"
+ "pmaddwd 8(%2),%%mm1\n\t"
+ "pmaddwd 40(%2),%%mm5\n\t"
+ "movq 16(%1),%%mm2\n\t"
+ "movq 80(%1),%%mm6\n\t"
+ "pmaddwd 16(%2),%%mm2\n\t"
+ "pmaddwd 48(%2),%%mm6\n\t"
+ "movq 24(%1),%%mm3\n\t"
+ "movq 88(%1),%%mm7\n\t"
+ "pmaddwd 24(%2),%%mm3\n\t"
+ "pmaddwd 56(%2),%%mm7\n\t"
"paddd %%mm1,%%mm0\n\t"
"paddd %%mm5,%%mm4\n\t"
"paddd %%mm2,%%mm0\n\t"
@@ -126,32 +117,28 @@
"packssdw %%mm0,%%mm0\n\t"
"packssdw %%mm4,%%mm4\n\t"
- "movq (%%edi), %%mm1\n\t"
+ "movq (%3), %%mm1\n\t"
"punpckldq %%mm4, %%mm0\n\t"
"pand "MANGLE(one_null)", %%mm1\n\t"
"pand "MANGLE(null_one)", %%mm0\n\t"
"por %%mm0, %%mm1\n\t"
- "movq %%mm1,(%%edi)\n\t"
+ "movq %%mm1,(%3)\n\t"
- "leal 64(%%esi),%%esi\n\t"
- "leal 128(%%edx),%%edx\n\t"
- "leal 8(%%edi),%%edi\n\t"
+ "add $64,%2\n\t"
+ "add $128,%1\n\t"
+ "add $8,%3\n\t"
- "decl %%ecx\n\t"
+ "decl %0\n\t"
"jnz .L03\n\t"
- "popl %%ecx\n\t"
- "andl $1, %%ecx\n\t"
- "jecxz .next_loop\n\t"
-
- "movq (%%edx),%%mm0\n\t"
- "pmaddwd (%%esi),%%mm0\n\t"
- "movq 8(%%edx),%%mm1\n\t"
- "pmaddwd 8(%%esi),%%mm1\n\t"
- "movq 16(%%edx),%%mm2\n\t"
- "pmaddwd 16(%%esi),%%mm2\n\t"
- "movq 24(%%edx),%%mm3\n\t"
- "pmaddwd 24(%%esi),%%mm3\n\t"
+ "movq (%1),%%mm0\n\t"
+ "pmaddwd (%2),%%mm0\n\t"
+ "movq 8(%1),%%mm1\n\t"
+ "pmaddwd 8(%2),%%mm1\n\t"
+ "movq 16(%1),%%mm2\n\t"
+ "pmaddwd 16(%2),%%mm2\n\t"
+ "movq 24(%1),%%mm3\n\t"
+ "pmaddwd 24(%2),%%mm3\n\t"
"paddd %%mm1,%%mm0\n\t"
"paddd %%mm2,%%mm0\n\t"
"paddd %%mm3,%%mm0\n\t"
@@ -161,32 +148,30 @@
"psrad $13,%%mm0\n\t"
"packssdw %%mm0,%%mm0\n\t"
"movd %%mm0,%%eax\n\t"
- "movw %%ax, (%%edi)\n\t"
- "leal 32(%%esi),%%esi\n\t"
- "leal 64(%%edx),%%edx\n\t"
- "leal 4(%%edi),%%edi\n\t"
+ "movw %%ax, (%3)\n\t"
+ "sub $32,%2\n\t"
+ "add $64,%1\n\t"
+ "add $4,%3\n\t"
-".next_loop:\n\t"
- "subl $64,%%esi\n\t"
- "movl $7,%%ecx\n\t"
+ "movl $7,%0\n\t"
ASMALIGN(4)
".L04:\n\t"
- "movq (%%edx),%%mm0\n\t"
- "movq 64(%%edx),%%mm4\n\t"
- "pmaddwd (%%esi),%%mm0\n\t"
- "pmaddwd -32(%%esi),%%mm4\n\t"
- "movq 8(%%edx),%%mm1\n\t"
- "movq 72(%%edx),%%mm5\n\t"
- "pmaddwd 8(%%esi),%%mm1\n\t"
- "pmaddwd -24(%%esi),%%mm5\n\t"
- "movq 16(%%edx),%%mm2\n\t"
- "movq 80(%%edx),%%mm6\n\t"
- "pmaddwd 16(%%esi),%%mm2\n\t"
- "pmaddwd -16(%%esi),%%mm6\n\t"
- "movq 24(%%edx),%%mm3\n\t"
- "movq 88(%%edx),%%mm7\n\t"
- "pmaddwd 24(%%esi),%%mm3\n\t"
- "pmaddwd -8(%%esi),%%mm7\n\t"
+ "movq (%1),%%mm0\n\t"
+ "movq 64(%1),%%mm4\n\t"
+ "pmaddwd (%2),%%mm0\n\t"
+ "pmaddwd -32(%2),%%mm4\n\t"
+ "movq 8(%1),%%mm1\n\t"
+ "movq 72(%1),%%mm5\n\t"
+ "pmaddwd 8(%2),%%mm1\n\t"
+ "pmaddwd -24(%2),%%mm5\n\t"
+ "movq 16(%1),%%mm2\n\t"
+ "movq 80(%1),%%mm6\n\t"
+ "pmaddwd 16(%2),%%mm2\n\t"
+ "pmaddwd -16(%2),%%mm6\n\t"
+ "movq 24(%1),%%mm3\n\t"
+ "movq 88(%1),%%mm7\n\t"
+ "pmaddwd 24(%2),%%mm3\n\t"
+ "pmaddwd -8(%2),%%mm7\n\t"
"paddd %%mm1,%%mm0\n\t"
"paddd %%mm5,%%mm4\n\t"
"paddd %%mm2,%%mm0\n\t"
@@ -208,27 +193,27 @@
"psubsw %%mm1,%%mm0\n\t"
"psubsw %%mm5,%%mm4\n\t"
- "movq (%%edi), %%mm1\n\t"
+ "movq (%3), %%mm1\n\t"
"punpckldq %%mm4, %%mm0\n\t"
"pand "MANGLE(one_null)", %%mm1\n\t"
"pand "MANGLE(null_one)", %%mm0\n\t"
"por %%mm0, %%mm1\n\t"
- "movq %%mm1,(%%edi)\n\t"
+ "movq %%mm1,(%3)\n\t"
- "subl $64,%%esi\n\t"
- "addl $128,%%edx\n\t"
- "leal 8(%%edi),%%edi\n\t"
- "decl %%ecx\n\t"
+ "sub $64,%2\n\t"
+ "add $128,%1\n\t"
+ "add $8,%3\n\t"
+ "decl %0\n\t"
"jnz .L04\n\t"
- "movq (%%edx),%%mm0\n\t"
- "pmaddwd (%%esi),%%mm0\n\t"
- "movq 8(%%edx),%%mm1\n\t"
- "pmaddwd 8(%%esi),%%mm1\n\t"
- "movq 16(%%edx),%%mm2\n\t"
- "pmaddwd 16(%%esi),%%mm2\n\t"
- "movq 24(%%edx),%%mm3\n\t"
- "pmaddwd 24(%%esi),%%mm3\n\t"
+ "movq (%1),%%mm0\n\t"
+ "pmaddwd (%2),%%mm0\n\t"
+ "movq 8(%1),%%mm1\n\t"
+ "pmaddwd 8(%2),%%mm1\n\t"
+ "movq 16(%1),%%mm2\n\t"
+ "pmaddwd 16(%2),%%mm2\n\t"
+ "movq 24(%1),%%mm3\n\t"
+ "pmaddwd 24(%2),%%mm3\n\t"
"paddd %%mm1,%%mm0\n\t"
"paddd %%mm2,%%mm0\n\t"
"paddd %%mm3,%%mm0\n\t"
@@ -240,9 +225,11 @@
"psubd %%mm0,%%mm0\n\t"
"psubsw %%mm1,%%mm0\n\t"
"movd %%mm0,%%eax\n\t"
- "movw %%ax,(%%edi)\n\t"
+ "movw %%ax,(%3)\n\t"
"emms\n\t"
- :
- :"m"(bandPtr),"m"(channel),"m"(samples),"m"(buffs),"m"(bo), "m"(temp)
- :"memory","%edi","%esi","%eax","%ebx","%ecx","%edx","%esp");
+ :"+r"(i), "+r"(window), "+r"(b0), "+r"(samples)
+ :
+ :"memory", "%eax");
+ return 0;
}
+
Index: mp3lib/decod386.c
===================================================================
--- mp3lib/decod386.c (revision 23328)
+++ mp3lib/decod386.c (working copy)
@@ -124,13 +124,7 @@
static synth_func_t synth_func;
#if defined(CAN_COMPILE_X86_ASM) && defined(HAVE_MMX)
-int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
-{
- static short buffs[2][2][0x110];
- static int bo = 1;
- synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo);
- return 0;
-}
+extern int synth_1to1_MMX( real *bandPtr,int channel,short * samples);
#endif
#ifdef HAVE_ALTIVEC
More information about the MPlayer-dev-eng
mailing list