[MPlayer-dev-eng] [PATCH] Make mp3lib SIMD optimizations work on AMD64, Part 4
Zuxy Meng
zuxy.meng at gmail.com
Wed May 23 17:46:10 CEST 2007
Hi,
2007/5/22, Guillaume POIRIER <poirierg at gmail.com>:
> Hi,
>
> Looks Ok to me.
> (I tested that it doesn't break compilation on AMD64, so there's no
> obvious regression with regards to bringing AMD64 support ... :-) )
>
> Commit anytime
Committed. Part 4 is big... I didn't bother to decipher the assembly in
tabinit_MMX.c; instead I recorded what was produced by
make_decode_tables_MMX() and used that data to initialize
mp3lib_decwins directly. Therefore tabinit_MMX.c itself isn't needed
any more.
I guess this approach causes the least trouble.
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/Makefile
===================================================================
--- mp3lib/Makefile (revision 23377)
+++ mp3lib/Makefile (working copy)
@@ -5,7 +5,7 @@
SRCS_COMMON = sr1.c
ifeq ($(TARGET_ARCH_X86_32),yes)
SRCS_COMMON += decode_i586.c
-SRCS_COMMON-$(TARGET_MMX) += decode_MMX.c dct64_MMX.c tabinit_MMX.c
+SRCS_COMMON-$(TARGET_MMX) += decode_MMX.c dct64_MMX.c
SRCS_COMMON-$(TARGET_3DNOW) += dct36_3dnow.c dct64_3dnow.c
SRCS_COMMON-$(TARGET_3DNOWEX) += dct36_k7.c dct64_k7.c
SRCS_COMMON-$(TARGET_SSE) += dct64_sse.c
Index: mp3lib/decode_MMX.c
===================================================================
--- mp3lib/decode_MMX.c (revision 23378)
+++ mp3lib/decode_MMX.c (working copy)
@@ -11,7 +11,6 @@
#include "mangle.h"
#define real float /* ugly - but only way */
-extern short mp3lib_decwins[];
extern void (*dct64_MMX_func)(short*, short*, real*);
static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL;
static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL;
@@ -50,6 +49,138 @@
1060439283,
};
+static short __attribute__((aligned(8))) mp3lib_decwins[] =
+{
+ 0, 7, 54, 114, 510, 1288, 1644, 9372,
+ 18760, -9373, 1644, -1289, 510, -115, 54, -8,
+ 0, 7, 54, 114, 510, 1288, 1644, 9372,
+ 18760, -9373, 1644, -1289, 510, -115, 54, -8,
+ 0, 7, 55, 129, 500, 1379, 1490, 9834,
+ 18748, -8910, 1784, -1197, 516, -101, 52, -7,
+ 0, 7, 55, 129, 500, 1379, 1490, 9834,
+ 18748, -8910, 1784, -1197, 516, -101, 52, -7,
+ 0, 8, 56, 145, 488, 1469, 1322, 10294,
+ 18714, -8448, 1910, -1107, 520, -87, 51, -6,
+ 0, 8, 56, 145, 488, 1469, 1322, 10294,
+ 18714, -8448, 1910, -1107, 520, -87, 51, -6,
+ 0, 9, 57, 161, 474, 1559, 1141, 10751,
+ 18658, -7987, 2023, -1016, 522, -74, 49, -6,
+ 0, 9, 57, 161, 474, 1559, 1141, 10751,
+ 18658, -7987, 2023, -1016, 522, -74, 49, -6,
+ 0, 10, 57, 177, 456, 1647, 944, 11205,
+ 18579, -7528, 2123, -927, 522, -61, 48, -5,
+ 0, 10, 57, 177, 456, 1647, 944, 11205,
+ 18579, -7528, 2123, -927, 522, -61, 48, -5,
+ 0, 11, 57, 194, 435, 1733, 734, 11654,
+ 18477, -7073, 2210, -838, 519, -50, 46, -5,
+ 0, 11, 57, 194, 435, 1733, 734, 11654,
+ 18477, -7073, 2210, -838, 519, -50, 46, -5,
+ 0, 12, 57, 212, 411, 1817, 510, 12097,
+ 18354, -6621, 2285, -751, 515, -39, 44, -4,
+ 0, 12, 57, 212, 411, 1817, 510, 12097,
+ 18354, -6621, 2285, -751, 515, -39, 44, -4,
+ 0, 13, 57, 229, 384, 1899, 271, 12534,
+ 18209, -6174, 2348, -666, 508, -28, 43, -4,
+ 0, 13, 57, 229, 384, 1899, 271, 12534,
+ 18209, -6174, 2348, -666, 508, -28, 43, -4,
+ 0, 14, 56, 247, 354, 1977, 18, 12963,
+ 18043, -5733, 2398, -583, 501, -18, 41, -4,
+ 0, 14, 56, 247, 354, 1977, 18, 12963,
+ 18043, -5733, 2398, -583, 501, -18, 41, -4,
+ 0, 15, 56, 266, 320, 2052, -249, 13383,
+ 17855, -5298, 2438, -502, 491, -9, 39, -3,
+ 0, 15, 56, 266, 320, 2052, -249, 13383,
+ 17855, -5298, 2438, -502, 491, -9, 39, -3,
+ 0, 17, 54, 284, 283, 2122, -530, 13794,
+ 17648, -4870, 2466, -423, 480, -1, 37, -3,
+ 0, 17, 54, 284, 283, 2122, -530, 13794,
+ 17648, -4870, 2466, -423, 480, -1, 37, -3,
+ 0, 18, 52, 302, 243, 2188, -825, 14194,
+ 17420, -4450, 2484, -347, 468, 7, 35, -3,
+ 0, 18, 52, 302, 243, 2188, -825, 14194,
+ 17420, -4450, 2484, -347, 468, 7, 35, -3,
+ 0, 19, 50, 320, 199, 2249, -1133, 14583,
+ 17173, -4039, 2492, -274, 455, 14, 33, -2,
+ 0, 19, 50, 320, 199, 2249, -1133, 14583,
+ 17173, -4039, 2492, -274, 455, 14, 33, -2,
+ -1, 21, 48, 339, 152, 2304, -1454, 14959,
+ 16908, -3637, 2490, -204, 440, 20, 32, -2,
+ -1, 21, 48, 339, 152, 2304, -1454, 14959,
+ 16908, -3637, 2490, -204, 440, 20, 32, -2,
+ -1, 22, 45, 357, 101, 2354, -1788, 15322,
+ 16624, -3245, 2479, -137, 425, 26, 30, -2,
+ -1, 22, 45, 357, 101, 2354, -1788, 15322,
+ 16624, -3245, 2479, -137, 425, 26, 30, -2,
+ -1, 24, 41, 374, 47, 2396, -2135, 15671,
+ 16323, -2864, 2460, -72, 409, 31, 28, -2,
+ -1, 24, 41, 374, 47, 2396, -2135, 15671,
+ 16323, -2864, 2460, -72, 409, 31, 28, -2,
+ -1, 26, 37, 391, -11, 2431, -2493, 16004,
+ 16005, -2494, 2432, -12, 392, 36, 26, -2,
+ -1, 26, 37, 391, -11, 2431, -2493, 16004,
+ 16005, -2494, 2432, -12, 392, 36, 26, -2,
+ -2, -28, 31, -409, -72, -2460, -2864, -16323,
+ 15671, 2135, 2396, -47, 374, -41, 24, 1,
+ -2, -28, 31, -409, -72, -2460, -2864, -16323,
+ 15671, 2135, 2396, -47, 374, -41, 24, 1,
+ -2, -30, 26, -425, -137, -2479, -3245, -16624,
+ 15322, 1788, 2354, -101, 357, -45, 22, 1,
+ -2, -30, 26, -425, -137, -2479, -3245, -16624,
+ 15322, 1788, 2354, -101, 357, -45, 22, 1,
+ -2, -32, 20, -440, -204, -2490, -3637, -16908,
+ 14959, 1454, 2304, -152, 339, -48, 21, 1,
+ -2, -32, 20, -440, -204, -2490, -3637, -16908,
+ 14959, 1454, 2304, -152, 339, -48, 21, 1,
+ -2, -33, 14, -455, -274, -2492, -4039, -17173,
+ 14583, 1133, 2249, -199, 320, -50, 19, 0,
+ -2, -33, 14, -455, -274, -2492, -4039, -17173,
+ 14583, 1133, 2249, -199, 320, -50, 19, 0,
+ -3, -35, 7, -468, -347, -2484, -4450, -17420,
+ 14194, 825, 2188, -243, 302, -52, 18, 0,
+ -3, -35, 7, -468, -347, -2484, -4450, -17420,
+ 14194, 825, 2188, -243, 302, -52, 18, 0,
+ -3, -37, -1, -480, -423, -2466, -4870, -17648,
+ 13794, 530, 2122, -283, 284, -54, 17, 0,
+ -3, -37, -1, -480, -423, -2466, -4870, -17648,
+ 13794, 530, 2122, -283, 284, -54, 17, 0,
+ -3, -39, -9, -491, -502, -2438, -5298, -17855,
+ 13383, 249, 2052, -320, 266, -56, 15, 0,
+ -3, -39, -9, -491, -502, -2438, -5298, -17855,
+ 13383, 249, 2052, -320, 266, -56, 15, 0,
+ -4, -41, -18, -501, -583, -2398, -5733, -18043,
+ 12963, -18, 1977, -354, 247, -56, 14, 0,
+ -4, -41, -18, -501, -583, -2398, -5733, -18043,
+ 12963, -18, 1977, -354, 247, -56, 14, 0,
+ -4, -43, -28, -508, -666, -2348, -6174, -18209,
+ 12534, -271, 1899, -384, 229, -57, 13, 0,
+ -4, -43, -28, -508, -666, -2348, -6174, -18209,
+ 12534, -271, 1899, -384, 229, -57, 13, 0,
+ -4, -44, -39, -515, -751, -2285, -6621, -18354,
+ 12097, -510, 1817, -411, 212, -57, 12, 0,
+ -4, -44, -39, -515, -751, -2285, -6621, -18354,
+ 12097, -510, 1817, -411, 212, -57, 12, 0,
+ -5, -46, -50, -519, -838, -2210, -7073, -18477,
+ 11654, -734, 1733, -435, 194, -57, 11, 0,
+ -5, -46, -50, -519, -838, -2210, -7073, -18477,
+ 11654, -734, 1733, -435, 194, -57, 11, 0,
+ -5, -48, -61, -522, -927, -2123, -7528, -18579,
+ 11205, -944, 1647, -456, 177, -57, 10, 0,
+ -5, -48, -61, -522, -927, -2123, -7528, -18579,
+ 11205, -944, 1647, -456, 177, -57, 10, 0,
+ -6, -49, -74, -522, -1016, -2023, -7987, -18658,
+ 10751, -1141, 1559, -474, 161, -57, 9, 0,
+ -6, -49, -74, -522, -1016, -2023, -7987, -18658,
+ 10751, -1141, 1559, -474, 161, -57, 9, 0,
+ -6, -51, -87, -520, -1107, -1910, -8448, -18714,
+ 10294, -1322, 1469, -488, 145, -56, 8, 0,
+ -6, -51, -87, -520, -1107, -1910, -8448, -18714,
+ 10294, -1322, 1469, -488, 145, -56, 8, 0,
+ -7, -52, -101, -516, -1197, -1784, -8910, -18748,
+ 9834, -1490, 1379, -500, 129, -55, 7, 0,
+ -7, -52, -101, -516, -1197, -1784, -8910, -18748,
+ 9834, -1490, 1379, -500, 129, -55, 7, 0,
+};
+
int synth_1to1_MMX(real *bandPtr, int channel, short *samples)
{
static short buffs[2][2][0x110] __attribute__((aligned(8)));
Index: mp3lib/tabinit_MMX.c
===================================================================
--- mp3lib/tabinit_MMX.c (revision 23377)
+++ mp3lib/tabinit_MMX.c (working copy)
@@ -1,160 +0,0 @@
-/*
- * This code was taken from http://www.mpg123.org
- * See ChangeLog of mpg123-0.59s-pre.1 for detail
- * Applied to mplayer by Nick Kurshev <nickols_k at mail.ru>
-*/
-#include "config.h"
-#include "mangle.h"
-
-long __attribute__((aligned(8))) mp3lib_decwins [544];
-
-#define real float
-extern real mp3lib_decwin[(512+32)];
-// static long decwin [544];
-
-static short attribute_used intwinbase_MMX[] =
-{
- 0, -1, -1, -1, -1, -1, -1, -2,
- -2, -2, -2, -3, -3, -4, -4, -5,
- -5, -6, -7, -7, -8, -9, -10, -11,
- -13, -14, -16, -17, -19, -21, -24, -26,
- -29, -31, -35, -38, -41, -45, -49, -53,
- -58, -63, -68, -73, -79, -85, -91, -97,
- -104, -111, -117, -125, -132, -139, -147, -154,
- -161, -169, -176, -183, -190, -196, -202, -208,
- -213, -218, -222, -225, -227, -228, -228, -227,
- -224, -221, -215, -208, -200, -189, -177, -163,
- -146, -127, -106, -83, -57, -29, 2, 36,
- 72, 111, 153, 197, 244, 294, 347, 401,
- 459, 519, 581, 645, 711, 779, 848, 919,
- 991, 1064, 1137, 1210, 1283, 1356, 1428, 1498,
- 1567, 1634, 1698, 1759, 1817, 1870, 1919, 1962,
- 2001, 2032, 2057, 2075, 2085, 2087, 2080, 2063,
- 2037, 2000, 1952, 1893, 1822, 1739, 1644, 1535,
- 1414, 1280, 1131, 970, 794, 605, 402, 185,
- -45, -288, -545, -814, -1095, -1388, -1692, -2006,
- -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788,
- -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597,
- -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585,
- -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750,
- -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134,
- -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082,
- -70, 998, 2122, 3300, 4533, 5818, 7154, 8540,
- 9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189,
- 22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360,
- -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863,
- -8147, -6466, -4822, -3222, -1667, -162, 1289, 2684,
- 4019, 5290, 6494, 7629, 8692, 9679, 10590, 11420,
- 12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992,
- 15038
-};
-
-static long attribute_used intwindiv = 0x47800000;
-
-void make_decode_tables_MMX(long scaleval)
-{
- long intwinbase_step;
- intwinbase_step=2;
- scaleval =- scaleval;
- __asm __volatile(
- "xorl %%ecx,%%ecx\n\t"
- "xorl %%ebx,%%ebx\n\t"
- "movl $32,%%esi\n\t"
- "movl %0,%%edi\n\t"
-".L00:\n\t"
- "cmpl $528,%%ecx\n\t"
- "jnc .L02\n\t"
- "movswl (%%edi),%%eax\n\t"
- "cmpl %0+444,%%edi\n\t"
- "jc .L01\n\t"
- "addl $60000,%%eax\n\t"
-".L01:\n\t"
- "pushl %%eax\n\t"
- "fildl (%%esp)\n\t"
- "fdivs "MANGLE(intwindiv)"\n\t"
- "popl %%eax\n\t"
- "fimull %1\n\t"
- "fsts "MANGLE(mp3lib_decwin)"(,%%ecx,4)\n\t"
- "fstps "MANGLE(mp3lib_decwin)"+64(,%%ecx,4)\n\t"
-".L02:\n\t"
- "leal -1(%%esi),%%edx\n\t"
- "and %%ebx,%%edx\n\t"
- "cmp $31,%%edx\n\t"
- "jnz .L03\n\t"
- "addl $-1023,%%ecx\n\t"
- "test %%esi,%%ebx\n\t"
- "jz .L03\n\t"
- "negl %1\n\t"
-".L03:\n\t"
- "addl %%esi,%%ecx\n\t"
- "addl %2,%%edi\n\t"
- "incl %%ebx\n\t"
- "cmpl %0,%%edi\n\t"
- "jz .L04\n\t"
- "cmp $256,%%ebx\n\t"
- "jnz .L00\n\t"
- "negl %2\n\t"
- "jmp .L00\n\t"
-".L04:\n\t"
- ::"g"(intwinbase_MMX),"m"(scaleval),"m"(intwinbase_step)
- :"memory","%eax","%ebx","%ecx","%edx","%esi","%edi");
-intwinbase_step=2;
- __asm __volatile(
- "xorl %%ecx,%%ecx\n\t"
- "xorl %%ebx,%%ebx\n\t"
-".L05:\n\t"
- "cmpl $528,%%ecx\n\t"
- "jnc .L11\n\t"
- "movswl (%%edi),%%eax\n\t"
- "cmpl %0+444,%%edi\n\t"
- "jc .L06\n\t"
- "addl $60000,%%eax\n\t"
-".L06:\n\t"
- "cltd\n\t"
- "imull %1\n\t"
- "shrdl $17,%%edx,%%eax\n\t"
- "cmpl $32767,%%eax\n\t"
- "movl $1055,%%edx\n\t"
- "jle .L07\n\t"
- "movl $32767,%%eax\n\t"
- "jmp .L08\n\t"
-".L07:\n\t"
- "cmpl $-32767,%%eax\n\t"
- "jge .L08\n\t"
- "movl $-32767,%%eax\n\t"
-".L08:\n\t"
- "cmpl $512,%%ecx\n\t"
- "jnc .L09\n\t"
- "subl %%ecx,%%edx\n\t"
- "movw %%ax,"MANGLE(mp3lib_decwins)"(,%%edx,2)\n\t"
- "movw %%ax,"MANGLE(mp3lib_decwins)"-32(,%%edx,2)\n\t"
-".L09:\n\t"
- "testl $1,%%ecx\n\t"
- "jnz .L10\n\t"
- "negl %%eax\n\t"
-".L10:\n\t"
- "movw %%ax,"MANGLE(mp3lib_decwins)"(,%%ecx,2)\n\t"
- "movw %%ax,"MANGLE(mp3lib_decwins)"+32(,%%ecx,2)\n\t"
-".L11:\n\t"
- "leal -1(%%esi),%%edx\n\t"
- "and %%ebx,%%edx\n\t"
- "cmp $31,%%edx\n\t"
- "jnz .L12\n\t"
- "addl $-1023,%%ecx\n\t"
- "test %%esi,%%ebx\n\t"
- "jz .L12\n\t"
- "negl %1\n\t"
-".L12:\n\t"
- "addl %%esi,%%ecx\n\t"
- "addl %2,%%edi\n\t"
- "incl %%ebx\n\t"
- "cmpl %0,%%edi\n\t"
- "jz .L13\n\t"
- "cmp $256,%%ebx\n\t"
- "jnz .L05\n\t"
- "negl %2\n\t"
- "jmp .L05\n\t"
-".L13:\n\t"
- ::"g"(intwinbase_MMX),"m"(scaleval),"m"(intwinbase_step)
- :"memory","%eax","%ebx","%ecx","%edx","%esi","%edi");
-}
Index: mp3lib/sr1.c
===================================================================
--- mp3lib/sr1.c (revision 23377)
+++ mp3lib/sr1.c (working copy)
@@ -421,8 +421,6 @@
if (gCpuCaps.hasMMX)
{
_has_mmx = 1;
- make_decode_tables_MMX(outscale);
- mp_msg(MSGT_DECAUDIO,MSGL_V,"mp3lib: made decode tables with MMX optimization\n");
synth_func = synth_1to1_MMX;
}
#endif
More information about the MPlayer-dev-eng
mailing list