[Mplayer-cvslog] CVS: main/postproc swscale.c,1.86,1.87 swscale_template.c,1.75,1.76 swscale.h,1.15,1.16
Michael Niedermayer
michael at mplayer.dev.hu
Mon Apr 1 16:01:24 CEST 2002
Update of /cvsroot/mplayer/main/postproc
In directory mplayer:/var/tmp.root/cvs-serv9806
Modified Files:
swscale.c swscale_template.c swscale.h
Log Message:
Fixed an overread in the MMX2 horizontal scaler.
The horizontal MMX2 scaler is now 2% faster.
Index: swscale.c
===================================================================
RCS file: /cvsroot/mplayer/main/postproc/swscale.c,v
retrieving revision 1.86
retrieving revision 1.87
diff -u -r1.86 -r1.87
--- swscale.c 21 Mar 2002 18:53:32 -0000 1.86
+++ swscale.c 1 Apr 2002 14:01:22 -0000 1.87
@@ -117,10 +117,6 @@
extern int verbose; // defined in mplayer.c
/*
NOTES
-
-known BUGS with known cause (no bugreports please!, but patches are welcome :) )
-horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
-
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
TODO
@@ -1020,12 +1016,17 @@
}
#ifdef ARCH_X86
-static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
+static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
- uint8_t *fragment;
- int imm8OfPShufW1;
- int imm8OfPShufW2;
- int fragmentLength;
+ uint8_t *fragmentA;
+ int imm8OfPShufW1A;
+ int imm8OfPShufW2A;
+ int fragmentLengthA;
+ uint8_t *fragmentB;
+ int imm8OfPShufW1B;
+ int imm8OfPShufW2B;
+ int fragmentLengthB;
+ int fragmentPos;
int xpos, i;
@@ -1037,22 +1038,18 @@
"jmp 9f \n\t"
// Begin
"0: \n\t"
- "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
- "movq %%mm0, %%mm1 \n\t"
- "psrlq $8, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
+ "movq (%%edx, %%eax), %%mm3 \n\t"
+ "movd (%%ecx, %%esi), %%mm0 \n\t"
+ "movd 1(%%ecx, %%esi), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
- "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
- "psrlw $9, %%mm3 \n\t"
"psubw %%mm1, %%mm0 \n\t"
+ "movl 8(%%ebx, %%eax), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
- "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
@@ -1071,13 +1068,54 @@
"subl %0, %2 \n\t"
"leal 9b, %3 \n\t"
"subl %0, %3 \n\t"
- :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
- "=r" (fragmentLength)
+
+
+ :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
+ "=r" (fragmentLengthA)
);
- xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
+ asm volatile(
+ "jmp 9f \n\t"
+ // Begin
+ "0: \n\t"
+ "movq (%%edx, %%eax), %%mm3 \n\t"
+ "movd (%%ecx, %%esi), %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "pshufw $0xFF, %%mm0, %%mm1 \n\t"
+ "1: \n\t"
+ "pshufw $0xFF, %%mm0, %%mm0 \n\t"
+ "2: \n\t"
+ "psubw %%mm1, %%mm0 \n\t"
+ "movl 8(%%ebx, %%eax), %%esi \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "psllw $7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+
+ "movq %%mm0, (%%edi, %%eax) \n\t"
- for(i=0; i<dstW/8; i++)
+ "addl $8, %%eax \n\t"
+ // End
+ "9: \n\t"
+// "int $3\n\t"
+ "leal 0b, %0 \n\t"
+ "leal 1b, %1 \n\t"
+ "leal 2b, %2 \n\t"
+ "decl %1 \n\t"
+ "decl %2 \n\t"
+ "subl %0, %1 \n\t"
+ "subl %0, %2 \n\t"
+ "leal 9b, %3 \n\t"
+ "subl %0, %3 \n\t"
+
+
+ :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
+ "=r" (fragmentLengthB)
+ );
+
+ xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
+ fragmentPos=0;
+
+ for(i=0; i<dstW/numSplits; i++)
{
int xx=xpos>>16;
@@ -1088,20 +1126,65 @@
int c=((xpos+xInc*2)>>16) - xx;
int d=((xpos+xInc*3)>>16) - xx;
- memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
+ filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
+ filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
+ filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
+ filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
+ filterPos[i/2]= xx;
+
+ if(d+1<4)
+ {
+ int maxShift= 3-(d+1);
+ int shift=0;
+
+ memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
+
+ funnyCode[fragmentPos + imm8OfPShufW1B]=
+ (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
+ funnyCode[fragmentPos + imm8OfPShufW2B]=
+ a | (b<<2) | (c<<4) | (d<<6);
+
+ if(i+3>=dstW) shift=maxShift; //avoid overread
+ else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
+
+ if(shift && i>=shift)
+ {
+ funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
+ funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
+ filterPos[i/2]-=shift;
+ }
+
+ fragmentPos+= fragmentLengthB;
+ }
+ else
+ {
+ int maxShift= 3-d;
+ int shift=0;
+
+ memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
- funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
- funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
- a | (b<<2) | (c<<4) | (d<<6);
+ funnyCode[fragmentPos + imm8OfPShufW1A]=
+ funnyCode[fragmentPos + imm8OfPShufW2A]=
+ a | (b<<2) | (c<<4) | (d<<6);
- // if we dont need to read 8 bytes than dont :), reduces the chance of
- // crossing a cache line
- if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
+ if(i+4>=dstW) shift=maxShift; //avoid overread
+ else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
- funnyCode[fragmentLength*(i+4)/4]= RET;
+ if(shift && i>=shift)
+ {
+ funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
+ funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
+ filterPos[i/2]-=shift;
+ }
+
+ fragmentPos+= fragmentLengthA;
+ }
+
+ funnyCode[fragmentPos]= RET;
}
xpos+=xInc;
}
+ filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
#endif // ARCH_X86
@@ -1565,8 +1648,13 @@
// cant downscale !!!
if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
{
- initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
- initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
+ c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
+ c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
+ c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
+ c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
+
+ initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
+ initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
}
#endif
} // Init Horizontal stuff
@@ -2013,6 +2101,15 @@
c->lumMmxFilter = NULL;
if(c->chrMmxFilter) free(c->chrMmxFilter);
c->chrMmxFilter = NULL;
+
+ if(c->lumMmx2Filter) free(c->lumMmx2Filter);
+ c->lumMmx2Filter=NULL;
+ if(c->chrMmx2Filter) free(c->chrMmx2Filter);
+ c->chrMmx2Filter=NULL;
+ if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
+ c->lumMmx2FilterPos=NULL;
+ if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
+ c->chrMmx2FilterPos=NULL;
free(c);
}
Index: swscale_template.c
===================================================================
RCS file: /cvsroot/mplayer/main/postproc/swscale_template.c,v
retrieving revision 1.75
retrieving revision 1.76
diff -u -r1.75 -r1.76
--- swscale_template.c 3 Mar 2002 13:33:40 -0000 1.75
+++ swscale_template.c 1 Apr 2002 14:01:22 -0000 1.76
@@ -2238,7 +2238,8 @@
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
- int srcFormat, uint8_t *formatConvBuffer)
+ int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+ int32_t *mmx2FilterPos)
{
if(srcFormat==IMGFMT_YUY2)
{
@@ -2294,35 +2295,21 @@
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
- "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
- "movd %5, %%mm6 \n\t" // xInc&0xFFFF
- "punpcklwd %%mm6, %%mm6 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- "movq %%mm6, %%mm2 \n\t"
- "psllq $16, %%mm2 \n\t"
- "paddw %%mm6, %%mm2 \n\t"
- "psllq $16, %%mm2 \n\t"
- "paddw %%mm6, %%mm2 \n\t"
- "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
- "movq %%mm2, %%mm4 \n\t"
- "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
- "punpcklwd %%mm6, %%mm6 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
+ "movl %0, %%ecx \n\t"
+ "movl %1, %%edi \n\t"
+ "movl %2, %%edx \n\t"
+ "movl %3, %%ebx \n\t"
"xorl %%eax, %%eax \n\t" // i
- "movl %0, %%esi \n\t" // src
- "movl %1, %%edi \n\t" // buf1
- "movl %3, %%edx \n\t" // (xInc*4)>>16
- "xorl %%ecx, %%ecx \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
+ PREFETCH" (%%ecx) \n\t"
+ PREFETCH" 32(%%ecx) \n\t"
+ PREFETCH" 64(%%ecx) \n\t"
#define FUNNY_Y_CODE \
- PREFETCH" 1024(%%esi) \n\t"\
- PREFETCH" 1056(%%esi) \n\t"\
- PREFETCH" 1088(%%esi) \n\t"\
- "call *%6 \n\t"\
- "movq %%mm4, %%mm2 \n\t"\
- "xorl %%ecx, %%ecx \n\t"
+ "movl (%%ebx), %%esi \n\t"\
+ "call *%4 \n\t"\
+ "addl (%%ebx, %%eax), %%ecx \n\t"\
+ "addl %%eax, %%edi \n\t"\
+ "xorl %%eax, %%eax \n\t"\
FUNNY_Y_CODE
FUNNY_Y_CODE
@@ -2333,8 +2320,8 @@
FUNNY_Y_CODE
FUNNY_Y_CODE
- :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
- "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
+ :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+ "m" (funnyYCode)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
@@ -2402,7 +2389,8 @@
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
- int srcFormat, uint8_t *formatConvBuffer)
+ int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+ int32_t *mmx2FilterPos)
{
if(srcFormat==IMGFMT_YUY2)
{
@@ -2469,65 +2457,44 @@
if(canMMX2BeUsed)
{
asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
- "movd %5, %%mm6 \n\t" // xInc&0xFFFF
- "punpcklwd %%mm6, %%mm6 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- "movq %%mm6, %%mm2 \n\t"
- "psllq $16, %%mm2 \n\t"
- "paddw %%mm6, %%mm2 \n\t"
- "psllq $16, %%mm2 \n\t"
- "paddw %%mm6, %%mm2 \n\t"
- "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
- "movq %%mm2, %%mm4 \n\t"
- "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
- "punpcklwd %%mm6, %%mm6 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- "xorl %%eax, %%eax \n\t" // i
- "movl %0, %%esi \n\t" // src
- "movl %1, %%edi \n\t" // buf1
- "movl %3, %%edx \n\t" // (xInc*4)>>16
- "xorl %%ecx, %%ecx \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
-
-#define FUNNYUVCODE \
- PREFETCH" 1024(%%esi) \n\t"\
- PREFETCH" 1056(%%esi) \n\t"\
- PREFETCH" 1088(%%esi) \n\t"\
- "call *%7 \n\t"\
- "movq %%mm4, %%mm2 \n\t"\
- "xorl %%ecx, %%ecx \n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
- "xorl %%eax, %%eax \n\t" // i
- "movl %6, %%esi \n\t" // src
- "movl %1, %%edi \n\t" // buf1
- "addl $4096, %%edi \n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
- :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
- "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
- : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
- );
+ "pxor %%mm7, %%mm7 \n\t"
+ "movl %0, %%ecx \n\t"
+ "movl %1, %%edi \n\t"
+ "movl %2, %%edx \n\t"
+ "movl %3, %%ebx \n\t"
+ "xorl %%eax, %%eax \n\t" // i
+ PREFETCH" (%%ecx) \n\t"
+ PREFETCH" 32(%%ecx) \n\t"
+ PREFETCH" 64(%%ecx) \n\t"
+
+#define FUNNY_UV_CODE \
+ "movl (%%ebx), %%esi \n\t"\
+ "call *%4 \n\t"\
+ "addl (%%ebx, %%eax), %%ecx \n\t"\
+ "addl %%eax, %%edi \n\t"\
+ "xorl %%eax, %%eax \n\t"\
+
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+ "xorl %%eax, %%eax \n\t" // i
+ "movl %5, %%ecx \n\t" // src
+ "movl %1, %%edi \n\t" // buf1
+ "addl $4096, %%edi \n\t"
+ PREFETCH" (%%ecx) \n\t"
+ PREFETCH" 32(%%ecx) \n\t"
+ PREFETCH" 64(%%ecx) \n\t"
+
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+
+ :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+ "m" (funnyUVCode), "m" (src2)
+ : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+ );
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
{
// printf("%d %d %d\n", dstWidth, i, srcW);
@@ -2749,7 +2716,8 @@
// printf("%d %d\n", lumBufIndex, vLumBufSize);
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
- funnyYCode, c->srcFormat, formatConvBuffer);
+ funnyYCode, c->srcFormat, formatConvBuffer,
+ c->lumMmx2Filter, c->lumMmx2FilterPos);
lastInLumBuf++;
}
while(lastInChrBuf < lastChrSrcY)
@@ -2763,7 +2731,8 @@
//FIXME replace parameters through context struct (some at least)
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
- funnyUVCode, c->srcFormat, formatConvBuffer);
+ funnyUVCode, c->srcFormat, formatConvBuffer,
+ c->chrMmx2Filter, c->chrMmx2FilterPos);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
@@ -2787,7 +2756,8 @@
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
- funnyYCode, c->srcFormat, formatConvBuffer);
+ funnyYCode, c->srcFormat, formatConvBuffer,
+ c->lumMmx2Filter, c->lumMmx2FilterPos);
lastInLumBuf++;
}
while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
@@ -2800,7 +2770,8 @@
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
- funnyUVCode, c->srcFormat, formatConvBuffer);
+ funnyUVCode, c->srcFormat, formatConvBuffer,
+ c->chrMmx2Filter, c->chrMmx2FilterPos);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
Index: swscale.h
===================================================================
RCS file: /cvsroot/mplayer/main/postproc/swscale.h,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -r1.15 -r1.16
--- swscale.h 6 Feb 2002 20:52:14 -0000 1.15
+++ swscale.h 1 Apr 2002 14:01:22 -0000 1.16
@@ -69,6 +69,10 @@
uint8_t __attribute__((aligned(32))) funnyYCode[10000];
uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
+ int32_t *lumMmx2FilterPos;
+ int32_t *chrMmx2FilterPos;
+ int16_t *lumMmx2Filter;
+ int16_t *chrMmx2Filter;
int canMMX2BeUsed;
More information about the MPlayer-cvslog
mailing list