[MPlayer-dev-eng] [PATCH] Make mp3lib SIMD optimizations work on AMD64, Part 4

Zuxy Meng zuxy.meng at gmail.com
Wed May 23 17:46:10 CEST 2007


Hi,

2007/5/22, Guillaume POIRIER <poirierg at gmail.com>:
> Hi,
>
> Looks Ok to me.
> (I tested that it doesn't break compilation on AMD64, so there's no
> obvious regression with regards to bringing AMD64 support ... :-) )
>
> Commit anytime

Committed. Part 4 is big... I didn't bother to decipher the assembly in
tabinit_MMX.c; instead I recorded what was produced by
make_decode_tables_MMX() and used that data to initialize
mp3lib_decwins directly. Therefore tabinit_MMX.c itself isn't needed
any more.

I guess this approach causes the least trouble.
-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/Makefile
===================================================================
--- mp3lib/Makefile	(revision 23377)
+++ mp3lib/Makefile	(working copy)
@@ -5,7 +5,7 @@
 SRCS_COMMON = sr1.c
 ifeq ($(TARGET_ARCH_X86_32),yes)
 SRCS_COMMON                   += decode_i586.c
-SRCS_COMMON-$(TARGET_MMX)     += decode_MMX.c dct64_MMX.c tabinit_MMX.c
+SRCS_COMMON-$(TARGET_MMX)     += decode_MMX.c dct64_MMX.c
 SRCS_COMMON-$(TARGET_3DNOW)   += dct36_3dnow.c dct64_3dnow.c
 SRCS_COMMON-$(TARGET_3DNOWEX) += dct36_k7.c dct64_k7.c
 SRCS_COMMON-$(TARGET_SSE)     += dct64_sse.c
Index: mp3lib/decode_MMX.c
===================================================================
--- mp3lib/decode_MMX.c	(revision 23378)
+++ mp3lib/decode_MMX.c	(working copy)
@@ -11,7 +11,6 @@
 #include "mangle.h"
 #define real float /* ugly - but only way */
 
-extern short mp3lib_decwins[];
 extern void (*dct64_MMX_func)(short*, short*, real*);
 static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL;
 static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL;
@@ -50,6 +49,138 @@
 	1060439283,
 };
 
+static short __attribute__((aligned(8))) mp3lib_decwins[] =
+{
+	     0,	     7,	    54,	   114,	   510,	  1288,	  1644,	  9372,
+	 18760,	 -9373,	  1644,	 -1289,	   510,	  -115,	    54,	    -8,
+	     0,	     7,	    54,	   114,	   510,	  1288,	  1644,	  9372,
+	 18760,	 -9373,	  1644,	 -1289,	   510,	  -115,	    54,	    -8,
+	     0,	     7,	    55,	   129,	   500,	  1379,	  1490,	  9834,
+	 18748,	 -8910,	  1784,	 -1197,	   516,	  -101,	    52,	    -7,
+	     0,	     7,	    55,	   129,	   500,	  1379,	  1490,	  9834,
+	 18748,	 -8910,	  1784,	 -1197,	   516,	  -101,	    52,	    -7,
+	     0,	     8,	    56,	   145,	   488,	  1469,	  1322,	 10294,
+	 18714,	 -8448,	  1910,	 -1107,	   520,	   -87,	    51,	    -6,
+	     0,	     8,	    56,	   145,	   488,	  1469,	  1322,	 10294,
+	 18714,	 -8448,	  1910,	 -1107,	   520,	   -87,	    51,	    -6,
+	     0,	     9,	    57,	   161,	   474,	  1559,	  1141,	 10751,
+	 18658,	 -7987,	  2023,	 -1016,	   522,	   -74,	    49,	    -6,
+	     0,	     9,	    57,	   161,	   474,	  1559,	  1141,	 10751,
+	 18658,	 -7987,	  2023,	 -1016,	   522,	   -74,	    49,	    -6,
+	     0,	    10,	    57,	   177,	   456,	  1647,	   944,	 11205,
+	 18579,	 -7528,	  2123,	  -927,	   522,	   -61,	    48,	    -5,
+	     0,	    10,	    57,	   177,	   456,	  1647,	   944,	 11205,
+	 18579,	 -7528,	  2123,	  -927,	   522,	   -61,	    48,	    -5,
+	     0,	    11,	    57,	   194,	   435,	  1733,	   734,	 11654,
+	 18477,	 -7073,	  2210,	  -838,	   519,	   -50,	    46,	    -5,
+	     0,	    11,	    57,	   194,	   435,	  1733,	   734,	 11654,
+	 18477,	 -7073,	  2210,	  -838,	   519,	   -50,	    46,	    -5,
+	     0,	    12,	    57,	   212,	   411,	  1817,	   510,	 12097,
+	 18354,	 -6621,	  2285,	  -751,	   515,	   -39,	    44,	    -4,
+	     0,	    12,	    57,	   212,	   411,	  1817,	   510,	 12097,
+	 18354,	 -6621,	  2285,	  -751,	   515,	   -39,	    44,	    -4,
+	     0,	    13,	    57,	   229,	   384,	  1899,	   271,	 12534,
+	 18209,	 -6174,	  2348,	  -666,	   508,	   -28,	    43,	    -4,
+	     0,	    13,	    57,	   229,	   384,	  1899,	   271,	 12534,
+	 18209,	 -6174,	  2348,	  -666,	   508,	   -28,	    43,	    -4,
+	     0,	    14,	    56,	   247,	   354,	  1977,	    18,	 12963,
+	 18043,	 -5733,	  2398,	  -583,	   501,	   -18,	    41,	    -4,
+	     0,	    14,	    56,	   247,	   354,	  1977,	    18,	 12963,
+	 18043,	 -5733,	  2398,	  -583,	   501,	   -18,	    41,	    -4,
+	     0,	    15,	    56,	   266,	   320,	  2052,	  -249,	 13383,
+	 17855,	 -5298,	  2438,	  -502,	   491,	    -9,	    39,	    -3,
+	     0,	    15,	    56,	   266,	   320,	  2052,	  -249,	 13383,
+	 17855,	 -5298,	  2438,	  -502,	   491,	    -9,	    39,	    -3,
+	     0,	    17,	    54,	   284,	   283,	  2122,	  -530,	 13794,
+	 17648,	 -4870,	  2466,	  -423,	   480,	    -1,	    37,	    -3,
+	     0,	    17,	    54,	   284,	   283,	  2122,	  -530,	 13794,
+	 17648,	 -4870,	  2466,	  -423,	   480,	    -1,	    37,	    -3,
+	     0,	    18,	    52,	   302,	   243,	  2188,	  -825,	 14194,
+	 17420,	 -4450,	  2484,	  -347,	   468,	     7,	    35,	    -3,
+	     0,	    18,	    52,	   302,	   243,	  2188,	  -825,	 14194,
+	 17420,	 -4450,	  2484,	  -347,	   468,	     7,	    35,	    -3,
+	     0,	    19,	    50,	   320,	   199,	  2249,	 -1133,	 14583,
+	 17173,	 -4039,	  2492,	  -274,	   455,	    14,	    33,	    -2,
+	     0,	    19,	    50,	   320,	   199,	  2249,	 -1133,	 14583,
+	 17173,	 -4039,	  2492,	  -274,	   455,	    14,	    33,	    -2,
+	    -1,	    21,	    48,	   339,	   152,	  2304,	 -1454,	 14959,
+	 16908,	 -3637,	  2490,	  -204,	   440,	    20,	    32,	    -2,
+	    -1,	    21,	    48,	   339,	   152,	  2304,	 -1454,	 14959,
+	 16908,	 -3637,	  2490,	  -204,	   440,	    20,	    32,	    -2,
+	    -1,	    22,	    45,	   357,	   101,	  2354,	 -1788,	 15322,
+	 16624,	 -3245,	  2479,	  -137,	   425,	    26,	    30,	    -2,
+	    -1,	    22,	    45,	   357,	   101,	  2354,	 -1788,	 15322,
+	 16624,	 -3245,	  2479,	  -137,	   425,	    26,	    30,	    -2,
+	    -1,	    24,	    41,	   374,	    47,	  2396,	 -2135,	 15671,
+	 16323,	 -2864,	  2460,	   -72,	   409,	    31,	    28,	    -2,
+	    -1,	    24,	    41,	   374,	    47,	  2396,	 -2135,	 15671,
+	 16323,	 -2864,	  2460,	   -72,	   409,	    31,	    28,	    -2,
+	    -1,	    26,	    37,	   391,	   -11,	  2431,	 -2493,	 16004,
+	 16005,	 -2494,	  2432,	   -12,	   392,	    36,	    26,	    -2,
+	    -1,	    26,	    37,	   391,	   -11,	  2431,	 -2493,	 16004,
+	 16005,	 -2494,	  2432,	   -12,	   392,	    36,	    26,	    -2,
+	    -2,	   -28,	    31,	  -409,	   -72,	 -2460,	 -2864,	-16323,
+	 15671,	  2135,	  2396,	   -47,	   374,	   -41,	    24,	     1,
+	    -2,	   -28,	    31,	  -409,	   -72,	 -2460,	 -2864,	-16323,
+	 15671,	  2135,	  2396,	   -47,	   374,	   -41,	    24,	     1,
+	    -2,	   -30,	    26,	  -425,	  -137,	 -2479,	 -3245,	-16624,
+	 15322,	  1788,	  2354,	  -101,	   357,	   -45,	    22,	     1,
+	    -2,	   -30,	    26,	  -425,	  -137,	 -2479,	 -3245,	-16624,
+	 15322,	  1788,	  2354,	  -101,	   357,	   -45,	    22,	     1,
+	    -2,	   -32,	    20,	  -440,	  -204,	 -2490,	 -3637,	-16908,
+	 14959,	  1454,	  2304,	  -152,	   339,	   -48,	    21,	     1,
+	    -2,	   -32,	    20,	  -440,	  -204,	 -2490,	 -3637,	-16908,
+	 14959,	  1454,	  2304,	  -152,	   339,	   -48,	    21,	     1,
+	    -2,	   -33,	    14,	  -455,	  -274,	 -2492,	 -4039,	-17173,
+	 14583,	  1133,	  2249,	  -199,	   320,	   -50,	    19,	     0,
+	    -2,	   -33,	    14,	  -455,	  -274,	 -2492,	 -4039,	-17173,
+	 14583,	  1133,	  2249,	  -199,	   320,	   -50,	    19,	     0,
+	    -3,	   -35,	     7,	  -468,	  -347,	 -2484,	 -4450,	-17420,
+	 14194,	   825,	  2188,	  -243,	   302,	   -52,	    18,	     0,
+	    -3,	   -35,	     7,	  -468,	  -347,	 -2484,	 -4450,	-17420,
+	 14194,	   825,	  2188,	  -243,	   302,	   -52,	    18,	     0,
+	    -3,	   -37,	    -1,	  -480,	  -423,	 -2466,	 -4870,	-17648,
+	 13794,	   530,	  2122,	  -283,	   284,	   -54,	    17,	     0,
+	    -3,	   -37,	    -1,	  -480,	  -423,	 -2466,	 -4870,	-17648,
+	 13794,	   530,	  2122,	  -283,	   284,	   -54,	    17,	     0,
+	    -3,	   -39,	    -9,	  -491,	  -502,	 -2438,	 -5298,	-17855,
+	 13383,	   249,	  2052,	  -320,	   266,	   -56,	    15,	     0,
+	    -3,	   -39,	    -9,	  -491,	  -502,	 -2438,	 -5298,	-17855,
+	 13383,	   249,	  2052,	  -320,	   266,	   -56,	    15,	     0,
+	    -4,	   -41,	   -18,	  -501,	  -583,	 -2398,	 -5733,	-18043,
+	 12963,	   -18,	  1977,	  -354,	   247,	   -56,	    14,	     0,
+	    -4,	   -41,	   -18,	  -501,	  -583,	 -2398,	 -5733,	-18043,
+	 12963,	   -18,	  1977,	  -354,	   247,	   -56,	    14,	     0,
+	    -4,	   -43,	   -28,	  -508,	  -666,	 -2348,	 -6174,	-18209,
+	 12534,	  -271,	  1899,	  -384,	   229,	   -57,	    13,	     0,
+	    -4,	   -43,	   -28,	  -508,	  -666,	 -2348,	 -6174,	-18209,
+	 12534,	  -271,	  1899,	  -384,	   229,	   -57,	    13,	     0,
+	    -4,	   -44,	   -39,	  -515,	  -751,	 -2285,	 -6621,	-18354,
+	 12097,	  -510,	  1817,	  -411,	   212,	   -57,	    12,	     0,
+	    -4,	   -44,	   -39,	  -515,	  -751,	 -2285,	 -6621,	-18354,
+	 12097,	  -510,	  1817,	  -411,	   212,	   -57,	    12,	     0,
+	    -5,	   -46,	   -50,	  -519,	  -838,	 -2210,	 -7073,	-18477,
+	 11654,	  -734,	  1733,	  -435,	   194,	   -57,	    11,	     0,
+	    -5,	   -46,	   -50,	  -519,	  -838,	 -2210,	 -7073,	-18477,
+	 11654,	  -734,	  1733,	  -435,	   194,	   -57,	    11,	     0,
+	    -5,	   -48,	   -61,	  -522,	  -927,	 -2123,	 -7528,	-18579,
+	 11205,	  -944,	  1647,	  -456,	   177,	   -57,	    10,	     0,
+	    -5,	   -48,	   -61,	  -522,	  -927,	 -2123,	 -7528,	-18579,
+	 11205,	  -944,	  1647,	  -456,	   177,	   -57,	    10,	     0,
+	    -6,	   -49,	   -74,	  -522,	 -1016,	 -2023,	 -7987,	-18658,
+	 10751,	 -1141,	  1559,	  -474,	   161,	   -57,	     9,	     0,
+	    -6,	   -49,	   -74,	  -522,	 -1016,	 -2023,	 -7987,	-18658,
+	 10751,	 -1141,	  1559,	  -474,	   161,	   -57,	     9,	     0,
+	    -6,	   -51,	   -87,	  -520,	 -1107,	 -1910,	 -8448,	-18714,
+	 10294,	 -1322,	  1469,	  -488,	   145,	   -56,	     8,	     0,
+	    -6,	   -51,	   -87,	  -520,	 -1107,	 -1910,	 -8448,	-18714,
+	 10294,	 -1322,	  1469,	  -488,	   145,	   -56,	     8,	     0,
+	    -7,	   -52,	  -101,	  -516,	 -1197,	 -1784,	 -8910,	-18748,
+	  9834,	 -1490,	  1379,	  -500,	   129,	   -55,	     7,	     0,
+	    -7,	   -52,	  -101,	  -516,	 -1197,	 -1784,	 -8910,	-18748,
+	  9834,	 -1490,	  1379,	  -500,	   129,	   -55,	     7,	     0,
+};
+
 int synth_1to1_MMX(real *bandPtr, int channel, short *samples)
 {
     static short buffs[2][2][0x110] __attribute__((aligned(8)));
Index: mp3lib/tabinit_MMX.c
===================================================================
--- mp3lib/tabinit_MMX.c	(revision 23377)
+++ mp3lib/tabinit_MMX.c	(working copy)
@@ -1,160 +0,0 @@
-/*
- * This code was taken from http://www.mpg123.org
- * See ChangeLog of mpg123-0.59s-pre.1 for detail
- * Applied to mplayer by Nick Kurshev <nickols_k at mail.ru>
-*/
-#include "config.h"
-#include "mangle.h"
-
-long __attribute__((aligned(8))) mp3lib_decwins [544];
-
-#define real float
-extern real mp3lib_decwin[(512+32)];
-// static long decwin [544];
-
-static short attribute_used intwinbase_MMX[] = 
-{
-	      0,    -1,    -1,    -1,    -1,    -1,    -1,    -2,
-	     -2,    -2,    -2,    -3,    -3,    -4,    -4,    -5,
-	     -5,    -6,    -7,    -7,    -8,    -9,   -10,   -11,
-	    -13,   -14,   -16,   -17,   -19,   -21,   -24,   -26,
-	    -29,   -31,   -35,   -38,   -41,   -45,   -49,   -53,
-	    -58,   -63,   -68,   -73,   -79,   -85,   -91,   -97,
-	   -104,  -111,  -117,  -125,  -132,  -139,  -147,  -154,
-	   -161,  -169,  -176,  -183,  -190,  -196,  -202,  -208,
-	   -213,  -218,  -222,  -225,  -227,  -228,  -228,  -227,
-	   -224,  -221,  -215,  -208,  -200,  -189,  -177,  -163,
-	   -146,  -127,  -106,   -83,   -57,   -29,     2,    36,
-	     72,   111,   153,   197,   244,   294,   347,   401,
-	    459,   519,   581,   645,   711,   779,   848,   919,
-	    991,  1064,  1137,  1210,  1283,  1356,  1428,  1498,
-	   1567,  1634,  1698,  1759,  1817,  1870,  1919,  1962,
-	   2001,  2032,  2057,  2075,  2085,  2087,  2080,  2063,
-	   2037,  2000,  1952,  1893,  1822,  1739,  1644,  1535,
-	   1414,  1280,  1131,   970,   794,   605,   402,   185,
-	    -45,  -288,  -545,  -814, -1095, -1388, -1692, -2006,
-	  -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788,
-	  -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597,
-	  -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585,
-	  -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750,
-	  -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134,
-	  -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082,
-	    -70,   998,  2122,  3300,  4533,  5818,  7154,  8540,
-	   9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189,
-	  22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360,
-	 -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863,
-	  -8147, -6466, -4822, -3222, -1667,  -162,  1289,  2684,
-	   4019,  5290,  6494,  7629,  8692,  9679, 10590, 11420,
-	  12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992,
-	  15038
-};
-
-static long attribute_used intwindiv = 0x47800000;
-
-void make_decode_tables_MMX(long scaleval)
-{
-  long intwinbase_step;
-  intwinbase_step=2;
-  scaleval =- scaleval;
-    __asm __volatile(
-	"xorl %%ecx,%%ecx\n\t"
-	"xorl %%ebx,%%ebx\n\t"
-	"movl $32,%%esi\n\t"
-	"movl %0,%%edi\n\t"
-".L00:\n\t"
-	"cmpl $528,%%ecx\n\t"
-	"jnc .L02\n\t"
-	"movswl (%%edi),%%eax\n\t"
-	"cmpl %0+444,%%edi\n\t"
-	"jc .L01\n\t"
-	"addl $60000,%%eax\n\t"
-".L01:\n\t"
-	"pushl %%eax\n\t"
-	"fildl (%%esp)\n\t"
-	"fdivs "MANGLE(intwindiv)"\n\t"
-	"popl %%eax\n\t"
-	"fimull %1\n\t"
-	"fsts  "MANGLE(mp3lib_decwin)"(,%%ecx,4)\n\t"
-	"fstps "MANGLE(mp3lib_decwin)"+64(,%%ecx,4)\n\t"
-".L02:\n\t"
-	"leal -1(%%esi),%%edx\n\t"
-	"and %%ebx,%%edx\n\t"
-	"cmp $31,%%edx\n\t"
-	"jnz .L03\n\t"
-	"addl $-1023,%%ecx\n\t"
-	"test %%esi,%%ebx\n\t"
-	"jz  .L03\n\t"
-	"negl %1\n\t"
-".L03:\n\t"
-	"addl %%esi,%%ecx\n\t"
-	"addl %2,%%edi\n\t"
-	"incl %%ebx\n\t"
-	"cmpl %0,%%edi\n\t"
-	"jz .L04\n\t"
-	"cmp $256,%%ebx\n\t"
-	"jnz .L00\n\t"
-	"negl %2\n\t"
-	"jmp .L00\n\t"
-".L04:\n\t"
-	::"g"(intwinbase_MMX),"m"(scaleval),"m"(intwinbase_step)
-	:"memory","%eax","%ebx","%ecx","%edx","%esi","%edi");
-intwinbase_step=2;
-  __asm __volatile(
-	"xorl %%ecx,%%ecx\n\t"
-	"xorl %%ebx,%%ebx\n\t"
-".L05:\n\t"
-	"cmpl $528,%%ecx\n\t"
-	"jnc .L11\n\t"
-	"movswl (%%edi),%%eax\n\t"
-	"cmpl %0+444,%%edi\n\t"
-	"jc .L06\n\t"
-	"addl $60000,%%eax\n\t"
-".L06:\n\t"
-	"cltd\n\t"
-	"imull %1\n\t"
-	"shrdl $17,%%edx,%%eax\n\t"
-	"cmpl $32767,%%eax\n\t"
-	"movl $1055,%%edx\n\t"
-	"jle .L07\n\t"
-	"movl $32767,%%eax\n\t"
-	"jmp .L08\n\t"
-".L07:\n\t"
-	"cmpl $-32767,%%eax\n\t"
-	"jge .L08\n\t"
-	"movl $-32767,%%eax\n\t"
-".L08:\n\t"
-	"cmpl $512,%%ecx\n\t"
-	"jnc .L09\n\t"
-	"subl %%ecx,%%edx\n\t"
-	"movw %%ax,"MANGLE(mp3lib_decwins)"(,%%edx,2)\n\t"
-	"movw %%ax,"MANGLE(mp3lib_decwins)"-32(,%%edx,2)\n\t"
-".L09:\n\t"
-	"testl $1,%%ecx\n\t"
-	"jnz .L10\n\t"
-	"negl %%eax\n\t"
-".L10:\n\t"
-	"movw %%ax,"MANGLE(mp3lib_decwins)"(,%%ecx,2)\n\t"
-	"movw %%ax,"MANGLE(mp3lib_decwins)"+32(,%%ecx,2)\n\t"
-".L11:\n\t"
-	"leal -1(%%esi),%%edx\n\t"
-	"and %%ebx,%%edx\n\t"
-	"cmp $31,%%edx\n\t"
-	"jnz .L12\n\t"
-	"addl $-1023,%%ecx\n\t"
-	"test %%esi,%%ebx\n\t"
-	"jz  .L12\n\t"
-	"negl %1\n\t"
-".L12:\n\t"
-	"addl %%esi,%%ecx\n\t"
-	"addl %2,%%edi\n\t"
-	"incl %%ebx\n\t"
-	"cmpl %0,%%edi\n\t"
-	"jz .L13\n\t"
-	"cmp $256,%%ebx\n\t"
-	"jnz .L05\n\t"
-	"negl %2\n\t"
-	"jmp .L05\n\t"
-".L13:\n\t"
-	::"g"(intwinbase_MMX),"m"(scaleval),"m"(intwinbase_step)
-	:"memory","%eax","%ebx","%ecx","%edx","%esi","%edi");
-}
Index: mp3lib/sr1.c
===================================================================
--- mp3lib/sr1.c	(revision 23377)
+++ mp3lib/sr1.c	(working copy)
@@ -421,8 +421,6 @@
     if (gCpuCaps.hasMMX)
     {
 	_has_mmx = 1;
-	make_decode_tables_MMX(outscale);
-	mp_msg(MSGT_DECAUDIO,MSGL_V,"mp3lib: made decode tables with MMX optimization\n");
 	synth_func = synth_1to1_MMX;
     }
 #endif


More information about the MPlayer-dev-eng mailing list