[MPlayer-dev-eng] [PATCH] Make mp3lib SIMD optimizations work on AMD64, Part 5
Zuxy Meng
zuxy.meng at gmail.com
Fri May 25 08:14:49 CEST 2007
Hi,
2007/5/20, Zuxy Meng <zuxy.meng at gmail.com>:
> As discussed with Guillaume on IRC, I'll split my previous big patch
> (Rewrite synth_1to1_MMX....) into several small parts for easier
> review. Here's the first one, rewriting the generic code in
> synth_1to1_MMX from assembly to C, so we don't need to deal with
> different ABIs. I've tested it and confirmed it doesn't hurt
> performance.
>
> Note I removed a conditional jump in the remaining assembly too. By
> analyzing the code I'm sure it's never taken so don't worry about
> that. Strictly speaking it should be in a separate patch but then this
> patch would break mplayer...
>
> Part 2 will replace 32-bit leal to equivalent add/sub (without the 'l'
> suffix) so pointer arithmetic will be 64-bit under amd64.
>
> Part 3 will remove hardcoded registers.
>
> Part 4 will kill tabinit_mmx.c. We don't need to compute the table at
> runtime; it can be predetermined.
>
> Part 5 will correct data types, replacing 'long' with 'int' where necessary.
>
> The last patch will deal with Makefile and macros.
We've reached Part 5. This is again a big diff, replacing many 'long'
with 'int', and several 'real' with 'short'. Among those, many are
necessary because in an LP64 environment, sizeof(long) != sizeof(int)
and sizeof(long) != sizeof(float); others are optimizations, simply
because we really don't need such a long type, or because the declaration
didn't match its use (declared as float, used as short).
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: mp3lib/decode_i586.c
===================================================================
--- mp3lib/decode_i586.c (revision 23382)
+++ mp3lib/decode_i586.c (working copy)
@@ -33,9 +33,9 @@
#include "mangle.h"
#define real float /* ugly - but only way */
-static long attribute_used buffs[1088]={0};
-static long attribute_used bo=1;
-static long attribute_used saved_ebp=0;
+static int attribute_used buffs[1088]={0};
+static int attribute_used bo=1;
+static int attribute_used saved_ebp=0;
int synth_1to1_pent(real *bandPtr, int channel, short *samples)
{
Index: mp3lib/dct64_sse.c
===================================================================
--- mp3lib/dct64_sse.c (revision 23382)
+++ mp3lib/dct64_sse.c (working copy)
@@ -30,15 +18,12 @@
static const int nnnn[4] __attribute__((aligned(16))) =
{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
-void dct64_sse(real *a,real *b,real *c)
+void dct64_sse(short *out0,short *out1,real *c)
{
static real __attribute__ ((aligned(16))) b1[0x20];
static real __attribute__ ((aligned(16))) b2[0x20];
static real const one = 1.f;
- short *out0 = (short*)a;
- short *out1 = (short*)b;
-
{
real *costab = costab_mmx;
int i;
@@ -428,7 +413,7 @@
"fistp 416(%4)\n\t"
".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
:
- :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(a), "r"(b)
+ :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
:"memory"
);
#endif
Index: mp3lib/dct64_3dnow.c
===================================================================
--- mp3lib/dct64_3dnow.c (revision 23382)
+++ mp3lib/dct64_3dnow.c (working copy)
@@ -15,7 +15,7 @@
static unsigned long long int attribute_used __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
static float attribute_used plus_1f = 1.0;
-void dct64_MMX_3dnow(real *a,real *b,real *c)
+void dct64_MMX_3dnow(short *a,short *b,real *c)
{
char tmp[256];
__asm __volatile(
Index: mp3lib/layer3.c
===================================================================
--- mp3lib/layer3.c (revision 23382)
+++ mp3lib/layer3.c (working copy)
@@ -324,7 +324,7 @@
* read additional side information (for MPEG 1 and MPEG 2)
*/
static int III_get_side_info(struct III_sideinfo *si,int stereo,
- int ms_stereo,long sfreq,int single,int lsf)
+ int ms_stereo,int sfreq,int single,int lsf)
{
int ch, gr;
int powdiff = (single == 3) ? 4 : 0;
@@ -568,10 +568,10 @@
* Dequantize samples (includes huffman decoding)
*/
/* 24 is enough because tab13 has max. a 19 bit huffvector */
-#define BITSHIFT ((sizeof(long)-1)*8)
+#define BITSHIFT ((sizeof(int)-1)*8)
#define REFRESH_MASK \
while(num < BITSHIFT) { \
- mask |= ((unsigned long)getbyte())<<(BITSHIFT-num); \
+ mask |= ((unsigned)getbyte())<<(BITSHIFT-num); \
num += 8; \
part2remain -= 8; }
@@ -585,9 +585,9 @@
int *me;
int num=getbitoffset();
- long mask;
+ int mask;
/* we must split this, because for num==0 the shift is undefined if you do it in one step */
- mask = ((unsigned long) getbits(num))<<BITSHIFT;
+ mask = ((unsigned) getbits(num))<<BITSHIFT;
mask <<= 8-num;
part2remain -= num;
@@ -672,7 +672,7 @@
if(x == 15 && h->linbits) {
max[lwin] = cb;
REFRESH_MASK;
- x += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+ x += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
num -= h->linbits+1;
mask <<= h->linbits;
if(mask < 0)
@@ -696,7 +696,7 @@
if(y == 15 && h->linbits) {
max[lwin] = cb;
REFRESH_MASK;
- y += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+ y += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
num -= h->linbits+1;
mask <<= h->linbits;
if(mask < 0)
@@ -850,7 +850,7 @@
if (x == 15 && h->linbits) {
max = cb;
REFRESH_MASK;
- x += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+ x += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
num -= h->linbits+1;
mask <<= h->linbits;
if(mask < 0)
@@ -874,7 +874,7 @@
if (y == 15 && h->linbits) {
max = cb;
REFRESH_MASK;
- y += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits);
+ y += ((unsigned) mask) >> (BITSHIFT+8-h->linbits);
num -= h->linbits+1;
mask <<= h->linbits;
if(mask < 0)
@@ -1260,11 +1260,11 @@
granules = (fr->lsf) ? 1 : 2;
for (gr=0;gr<granules;gr++){
static real hybridIn[2][SBLIMIT][SSLIMIT];
static real hybridOut[2][SSLIMIT][SBLIMIT];
{ struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]);
- long part2bits;
+ int part2bits;
if(fr->lsf)
part2bits = III_get_scale_factors_2(scalefacs[0],gr_info,0);
else
@@ -1276,7 +1276,7 @@
if(stereo == 2) {
struct gr_info_s *gr_info = &(sideinfo.ch[1].gr[gr]);
- long part2bits;
+ int part2bits;
if(fr->lsf)
part2bits = III_get_scale_factors_2(scalefacs[1],gr_info,i_stereo);
else
Index: mp3lib/decode_MMX.c
===================================================================
--- mp3lib/decode_MMX.c (revision 23383)
+++ mp3lib/decode_MMX.c (working copy)
@@ -14,7 +14,7 @@
extern void (*dct64_MMX_func)(short*, short*, real*);
static unsigned long long attribute_used __attribute__((aligned(8))) null_one = 0x0000ffff0000ffffULL;
static unsigned long long attribute_used __attribute__((aligned(8))) one_null = 0xffff0000ffff0000ULL;
-unsigned long __attribute__((aligned(16))) costab_mmx[] =
+unsigned int __attribute__((aligned(16))) costab_mmx[] =
{
1056974725,
1057056395,
Index: mp3lib/dct64_MMX.c
===================================================================
--- mp3lib/dct64_MMX.c (revision 23382)
+++ mp3lib/dct64_MMX.c (working copy)
@@ -6,7 +6,7 @@
#include "mangle.h"
#define real float /* ugly - but only way */
-void dct64_MMX(real *a,real *b,real *c)
+void dct64_MMX(short *a,short *b,real *c)
{
char tmp[256];
__asm __volatile(
Index: mp3lib/sr1.c
===================================================================
--- mp3lib/sr1.c (revision 23383)
+++ mp3lib/sr1.c (working copy)
@@ -50,7 +50,6 @@
int MP3_channels=0;
int MP3_bps=2;
-static long outscale = 32768;
#include "tabinit.c"
#if 1
@@ -108,11 +107,11 @@
{0,8,16,24,32,40,48,56,64,80,96,112,128,144,160,} }
};
-static long freqs[9] = { 44100, 48000, 32000, 22050, 24000, 16000 , 11025 , 12000 , 8000 };
+static int freqs[9] = { 44100, 48000, 32000, 22050, 24000, 16000 , 11025 , 12000 , 8000 };
LOCAL unsigned int getbits(short number_of_bits)
{
- unsigned long rval;
+ unsigned int rval;
// if(MP3_frames>=7741) printf("getbits: bits=%d bitsleft=%d wordptr=%x\n",number_of_bits,bitsleft,wordpointer);
if((bitsleft-=number_of_bits)<0) return 0;
if(!number_of_bits) return 0;
@@ -133,7 +132,7 @@
LOCAL unsigned int getbits_fast(short number_of_bits)
{
- unsigned long rval;
+ unsigned int rval;
// if(MP3_frames>=7741) printf("getbits_fast: bits=%d bitsleft=%d wordptr=%x\n",number_of_bits,bitsleft,wordpointer);
if((bitsleft-=number_of_bits)<0) return 0;
if(!number_of_bits) return 0;
@@ -167,7 +166,7 @@
return ((rval>>7)&1);
}
-LOCAL void set_pointer(long backstep)
+LOCAL void set_pointer(int backstep)
{
// if(backstep!=512 && backstep>fsizeold)
// printf("\rWarning! backstep (%d>%d) \n",backstep,fsizeold);
@@ -178,10 +177,10 @@
// printf("Backstep %d (bitsleft=%d)\n",backstep,bitsleft);
}
-LOCAL int stream_head_read(unsigned char *hbuf,unsigned long *newhead){
+LOCAL int stream_head_read(unsigned char *hbuf,unsigned *newhead){
if(mp3_read(hbuf,4) != 4) return FALSE;
#if defined(CAN_COMPILE_X86_ASM)
- *newhead = bswap_32(*((unsigned long *)hbuf));
+ *newhead = bswap_32(*((unsigned*)hbuf));
#else
/*
* we may not be able to address unaligned 32-bit data on non-x86 cpus.
@@ -196,8 +195,8 @@
return TRUE;
}
-LOCAL int stream_head_shift(unsigned char *hbuf,unsigned long *head){
- *((unsigned long *)hbuf) >>= 8;
+LOCAL int stream_head_shift(unsigned char *hbuf,unsigned *head){
+ *((unsigned*)hbuf) >>= 8;
if(mp3_read(hbuf+3,1) != 1) return 0;
*head <<= 8;
*head |= hbuf[3];
@@ -208,7 +207,7 @@
* decode a header and write the information
* into the frame structure
*/
-LOCAL int decode_header(struct frame *fr,unsigned long newhead){
+LOCAL int decode_header(struct frame *fr,unsigned newhead){
// head_check:
if( (newhead & 0xffe00000) != 0xffe00000 ||
@@ -217,8 +216,8 @@
fr->lay = 4-((newhead>>17)&3);
// if(fr->lay!=3) return FALSE;
- if( newhead & ((long)1<<20) ) {
- fr->lsf = (newhead & ((long)1<<19)) ? 0x0 : 0x1;
+ if( newhead & (1<<20) ) {
+ fr->lsf = (newhead & (1<<19)) ? 0x0 : 0x1;
fr->mpeg25 = 0;
} else {
fr->lsf = 1;
@@ -253,7 +252,7 @@
case 2:
MP3_bitrate=tabsel_123[fr->lsf][1][fr->bitrate_index];
MP3_samplerate=freqs[fr->sampling_frequency];
- fr->framesize = (long) MP3_bitrate * 144000;
+ fr->framesize = MP3_bitrate * 144000;
fr->framesize /= MP3_samplerate;
MP3_framesize=fr->framesize;
fr->framesize += fr->padding - 4;
@@ -267,7 +266,7 @@
MP3_bitrate=tabsel_123[fr->lsf][2][fr->bitrate_index];
MP3_samplerate=freqs[fr->sampling_frequency];
- fr->framesize = (long) MP3_bitrate * 144000;
+ fr->framesize = MP3_bitrate * 144000;
fr->framesize /= MP3_samplerate<<(fr->lsf);
MP3_framesize=fr->framesize;
fr->framesize += fr->padding - 4;
@@ -276,7 +275,7 @@
// fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ? (fr->mode_ext<<2)+4 : 32;
MP3_bitrate=tabsel_123[fr->lsf][0][fr->bitrate_index];
MP3_samplerate=freqs[fr->sampling_frequency];
- fr->framesize = (long) MP3_bitrate * 12000;
+ fr->framesize = MP3_bitrate * 12000;
fr->framesize /= MP3_samplerate;
MP3_framesize = ((fr->framesize+fr->padding)<<2);
fr->framesize = MP3_framesize-4;
@@ -314,10 +313,10 @@
* read next frame return number of frames read.
*/
LOCAL int read_frame(struct frame *fr){
- unsigned long newhead;
+ unsigned newhead;
union {
unsigned char buf[8];
- unsigned long dummy; // for alignment
+ unsigned dummy; // for alignment
} hbuf;
int skipped,resyncpos;
int frames=0;
@@ -393,11 +392,11 @@
/******************************************************************************/
/* It's hidden from gcc in assembler */
-extern void dct64_MMX(real *, real *, real *);
-extern void dct64_MMX_3dnow(real *, real *, real *);
-extern void dct64_MMX_3dnowex(real *, real *, real *);
-extern void dct64_sse(real *, real *, real *);
-void (*dct64_MMX_func)(real *, real *, real *);
+extern void dct64_MMX(short *, short *, real *);
+extern void dct64_MMX_3dnow(short *, short *, real *);
+extern void dct64_MMX_3dnowex(short *, short *, real *);
+extern void dct64_sse(short *, short *, real *);
+void (*dct64_MMX_func)(short *, short *, real *);
#include "cpudetect.h"
@@ -413,7 +412,7 @@
_has_mmx = 0;
dct36_func = dct36;
- make_decode_tables(outscale);
+ make_decode_tables();
#ifdef CAN_COMPILE_X86_ASM
Index: mp3lib/mpg123.h
===================================================================
--- mp3lib/mpg123.h (revision 23382)
+++ mp3lib/mpg123.h (working copy)
@@ -71,7 +71,7 @@
int lay;
int error_protection;
int bitrate_index;
- long sampling_frequency;
+ int sampling_frequency;
int padding;
int extension;
int mode;
@@ -79,7 +79,7 @@
int copyright;
int original;
int emphasis;
- long framesize; /* computed framesize */
+ int framesize; /* computed framesize */
};
@@ -117,7 +117,7 @@
extern real *mp3lib_pnts[];
extern int synth_1to1_pent( real *,int,short * );
-extern void make_decode_tables_MMX(long scaleval);
+extern void make_decode_tables_MMX(void);
extern int synth_1to1_MMX( real *,int,short * );
extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
Index: mp3lib/dct64_k7.c
===================================================================
--- mp3lib/dct64_k7.c (revision 23382)
+++ mp3lib/dct64_k7.c (working copy)
@@ -15,7 +15,7 @@
static unsigned long long int attribute_used __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
static float attribute_used plus_1f = 1.0;
-void dct64_MMX_3dnowex(real *a,real *b,real *c)
+void dct64_MMX_3dnowex(short *a,short *b,real *c)
{
char tmp[256];
__asm __volatile(
Index: mp3lib/tabinit.c
===================================================================
--- mp3lib/tabinit.c (revision 23382)
+++ mp3lib/tabinit.c (working copy)
@@ -8,7 +8,7 @@
static real cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
real *mp3lib_pnts[]={ cos64,cos32,cos16,cos8,cos4 };
-static long intwinbase[] = {
+static int intwinbase[] = {
0, -1, -1, -1, -1, -1, -1, -2, -2, -2,
-2, -3, -3, -4, -4, -5, -5, -6, -7, -7,
-8, -9, -10, -11, -13, -14, -16, -17, -19, -21,
@@ -36,9 +36,9 @@
64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420, 72169, 72835,
73415, 73908, 74313, 74630, 74856, 74992, 75038 };
-static void make_decode_tables(long scaleval)
+static void make_decode_tables()
{
- int i,j,k,kr,divv;
+ int i,j,k,kr,divv,scaleval=32768;
real *table,*costab;
More information about the MPlayer-dev-eng
mailing list