[FFmpeg-devel] [PATCH] swscale alpha channel support
Michael Niedermayer
michaelni
Thu Mar 5 22:25:44 CET 2009
On Thu, Mar 05, 2009 at 03:09:26PM +0100, C?dric Schieli wrote:
> 2009/3/2 Michael Niedermayer <michaelni at gmx.at>:
> > On Fri, Feb 27, 2009 at 11:30:25PM +0100, C?dric Schieli wrote:
>
> [...]
>
> >> @@ -972,6 +986,14 @@
> >> ? ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> ? ? ? ? ? ? ? ? ?);
> >> ? ? ? ? ? ? ?}
> >> + ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && aDest){
> >> + ? ? ? ? ? ? ? ?__asm__ volatile(
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2YV121_ACCURATE
> >> + ? ? ? ? ? ? ? ? ? ?:: "r" (alpSrc+dstW), "r" (aDest+dstW),
> >> + ? ? ? ? ? ? ? ? ? ?"g" (-dstW)
> >> + ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> + ? ? ? ? ? ? ? ?);
> >> + ? ? ? ? ? ?}
> >> ? ? ? ? ?}else{
> >> ? ? ? ? ? ? ?while(p--){
> >> ? ? ? ? ? ? ? ? ?__asm__ volatile(
> >> @@ -981,6 +1003,14 @@
> >> ? ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> ? ? ? ? ? ? ? ? ?);
> >> ? ? ? ? ? ? ?}
> >> + ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && aDest){
> >> + ? ? ? ? ? ? ? ?__asm__ volatile(
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2YV121
> >> + ? ? ? ? ? ? ? ? ? ?:: "r" (alpSrc+dstW), "r" (aDest+dstW),
> >> + ? ? ? ? ? ? ? ? ? ?"g" (-dstW)
> >> + ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> + ? ? ? ? ? ? ? ?);
> >> + ? ? ? ? ? ?}
> >> ? ? ? ? ?}
> >> ? ? ? ? ?return;
> >> ? ? ?}
> >
> > i would prefer if these wouldnt be duplicated in the generated code
>
> patch updated
>
> > [...]
> >> @@ -1095,11 +1147,28 @@
> >> ? ? ? ? ? ? ?switch(c->dstFormat)
> >> ? ? ? ? ? ? ?{
> >> ? ? ? ? ? ? ?case PIX_FMT_RGB32:
> >> + ? ? ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2PACKEDX
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2RGBX
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? ? ? ? ?%%mm2, "U_TEMP"(%0) ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? ? ? ? ?%%mm4, "V_TEMP"(%0) ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? ? ? ? ?%%mm5, "Y_TEMP"(%0) ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET)
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ? ? ? ?$3, %%mm1 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ? ? ? ?$3, %%mm7 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"packuswb ? ? ? ? ? ? ? ? ?%%mm7, %%mm1 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? "U_TEMP"(%0), %%mm2 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? "V_TEMP"(%0), %%mm4 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? "Y_TEMP"(%0), %%mm5 ? ? ? ? \n\t"
> >
> > it seems that YSCALEYUV2PACKEDX_YA could be changed to take teh registers as
> > parameters to avoid the movq TEMP stuff ?
> > also same applies to all other cases where its possible
>
> patch updated
>
> > [...]
> >> @@ -1191,6 +1260,32 @@
> >> ? ? ? ? ?{
> >> ? ? ? ? ? ? ?//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
> >> ? ? ? ? ? ? ?case PIX_FMT_RGB32:
> >> + ? ? ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
> >> + ? ? ? ? ? ? ? ? ? ?*(uint16_t **)(&c->u_temp)=abuf0;
> >> + ? ? ? ? ? ? ? ? ? ?*(uint16_t **)(&c->v_temp)=abuf1;
> >> + ? ? ? ? ? ? ? ? ? ?__asm__ volatile(
> >> + ? ? ? ? ? ? ? ? ? ?"mov %%"REG_b", "ESP_OFFSET"(%5) ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov ? ? ? ?%4, %%"REG_b" ? ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"push %%"REG_BP" ? ? ? ? ? ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2RGB(%%REGBP, %5)
> >> + ? ? ? ? ? ? ? ? ? ?"push ? ? ? ? ? ? ? ? ? %0 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"push ? ? ? ? ? ? ? ? ? %1 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov ? ? ? ? ?"U_TEMP"(%5), %0 ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov ? ? ? ? ?"V_TEMP"(%5), %1 ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2RGB_YA(%%REGBP, %5)
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ?$3, %%mm1 ? ? ? \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ?$3, %%mm7 ? ? ? \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> >> + ? ? ? ? ? ? ? ? ? ?"packuswb ? ? ? ? ? ?%%mm7, %%mm1 ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"pop ? ? ? ? ? ? ? ? ? ?%1 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"pop ? ? ? ? ? ? ? ? ? ?%0 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
> >> + ? ? ? ? ? ? ? ? ? ?"pop %%"REG_BP" ? ? ? ? ? ? ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov "ESP_OFFSET"(%5), %%"REG_b" ? ? ? ?\n\t"
> >> +
> >> + ? ? ? ? ? ? ? ? ? ?:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
> >> + ? ? ? ? ? ? ? ? ? ?"a" (&c->redDither)
> >
> > the push/pop in the inner loop can at least on x86_64 be avoided
>
> patch updated
>
> > [...]
> >
> >> +static inline void RENAME(bgr32ToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
> >> + ? ?int i;
> >> + ? ?for (i=0; i<width; i++){
> >> +#ifdef WORDS_BIGENDIAN
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]&0xFF;
> >> +#else
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]>>24;
> >> +#endif
> >> + ? ?}
> >> +}
> >> +
> >> +static inline void RENAME(bgr32_1ToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
> >> + ? ?int i;
> >> + ? ?for (i=0; i<width; i++){
> >> +#ifdef WORDS_BIGENDIAN
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]>>24;
> >> +#else
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]&0xFF;
> >> +#endif
> >> + ? ?}
> >> +}
> >> +
> >
> > for(i=0; i<w; i++)
> > ? ?dst[i]= src[4*i];
> > and adjust src when calling
> > 1 function less
>
> patch updated
>
>
> Here is the updated patch set :
>
> #1 : sws_yuva420p_isPlanarYUV.patch
> YUVA420P is a planar YUV format, isn't it ?
yes, patch ok
>
> #2 : sws_scale_use_4_planes.patch
> don't ignore 4th plane
ok
[...]
> #5 : sws_configure_alpha.patch
> add --swscale-alpha configure option
not maintained by me
>
> #6 : sws_yuva2rgb.patch
> updated to have a mmx version even without HAVE_7REGS
I would prefer to have simple code over working around gcc bugs
with #ifdefs
>
> #7 : sws_scale_alpha.patch
> updated
comments below
>
> #8 : sws_output_yuva420p.patch
> updated
comments below
>
> #9 : swscale-example_use_alpha.patch
> updated
> the second hunk in this one needs some explanations :
> without it, swscale-example will segfault
> even without any of my patches, simply adding a uint16_t** field to
> struct SwsContext and av_malloc'ing between 13 and 236 bytes (or 26
> and 408 on x86_64) will make swscale-example to segfault (see
> bug.diff)
> after some debugging, I found that my alpPixBuf data get corrupted somehow
> disabling MMX code with --disable-mmx prevent this from happening
> running swscale-example under valgrind (even without bug.diff) will segfault
> can someone can help on this ?
in swscale_internal.h there are #defines that define the offsets of
various fields in the struct so the asm can access them, you dont maybe
mess that up by adding a field?
and if valgrind itself segfaults, that should be reported to the valgrind
devs ...
#7
[...]
> @@ -1191,6 +1254,52 @@
> {
> //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
> case PIX_FMT_RGB32:
> + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
> +#if !ARCH_X86_64
> + *(uint16_t **)(&c->u_temp)=abuf0;
> + *(uint16_t **)(&c->v_temp)=abuf1;
> +#endif
> + __asm__ volatile(
> +#if !ARCH_X86_64
> + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
> +#endif
> + "mov %4, %%"REG_b" \n\t"
> +#if !ARCH_X86_64
> + "push %%"REG_BP" \n\t"
> +#endif
> + YSCALEYUV2RGB(%%REGBP, %5)
> +#if !ARCH_X86_64
> + "push %0 \n\t"
> + "push %1 \n\t"
> + "mov "U_TEMP"(%5), %0 \n\t"
> + "mov "V_TEMP"(%5), %1 \n\t"
> +#endif
> +#if ARCH_X86_64
> + YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
> +#else
> + YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
> +#endif
> + "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> + "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> + "packuswb %%mm7, %%mm1 \n\t"
> +#if !ARCH_X86_64
> + "pop %1 \n\t"
> + "pop %0 \n\t"
> +#endif
> + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
> +#if !ARCH_X86_64
> + "pop %%"REG_BP" \n\t"
> + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
> +#endif
> +
> + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
> + "a" (&c->redDither)
> +#if ARCH_X86_64
> + ,"r" (abuf0), "r" (abuf1)
> + : "%"REG_b, "%"REG_BP
> +#endif
> + );
> + }else{
> __asm__ volatile(
> "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
> "mov %4, %%"REG_b" \n\t"
ehm, this looks really messy ...
[...]
> @@ -2144,11 +2293,17 @@
> }
> else if (srcFormat==PIX_FMT_RGB32)
> {
> + if (isAlpha)
> + RENAME(bgr32ToA)(formatConvBuffer, src+3, srcW, pal);
> + else
> RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
> src= formatConvBuffer;
> }
> else if (srcFormat==PIX_FMT_RGB32_1)
> {
> + if (isAlpha)
> + RENAME(bgr32ToA)(formatConvBuffer, src, srcW, pal);
> + else
> RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
> src= formatConvBuffer;
> }
> @@ -2169,11 +2324,17 @@
> }
> else if (srcFormat==PIX_FMT_BGR32)
> {
> + if (isAlpha)
> + RENAME(bgr32ToA)(formatConvBuffer, src+3, srcW, pal);
> + else
> RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
> src= formatConvBuffer;
> }
> else if (srcFormat==PIX_FMT_BGR32_1)
> {
> + if (isAlpha)
> + RENAME(bgr32ToA)(formatConvBuffer, src, srcW, pal);
> + else
> RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
> src= formatConvBuffer;
> }
This doesnt look like it would work on big endian systems
[...]
>
> + if (CONFIG_SWSCALE_ALPHA && (dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
> + memset(dst[3], 255, dstStride[3]*(dstY-lastDstY));
> +
you cant write outside 0..width per line, also stride can be negative
[...]
#8
> Index: ffmpeg/libswscale/swscale.c
> ===================================================================
> --- ffmpeg.orig/libswscale/swscale.c 2009-03-03 09:31:07.000000000 +0100
> +++ ffmpeg/libswscale/swscale.c 2009-03-03 10:00:28.000000000 +0100
> @@ -133,6 +133,7 @@
> )
> #define isSupportedOut(x) ( \
> (x)==PIX_FMT_YUV420P \
> + || (x)==PIX_FMT_YUVA420P \
> || (x)==PIX_FMT_YUYV422 \
> || (x)==PIX_FMT_UYVY422 \
> || (x)==PIX_FMT_YUV444P \
> @@ -2053,12 +2054,16 @@
> int srcSliceH, uint8_t* dst[], int dstStride[])
> {
> int plane;
> - for (plane=0; plane<3; plane++)
> + for (plane=0; plane<4; plane++)
> {
> - int length= plane==0 ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample);
> - int y= plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
> - int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
> -
> + int length= (plane==0 || plane==3) ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample);
> + int y= (plane==0 || plane==3) ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
> + int height= (plane==0 || plane==3) ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
> +
> + if (((!isALPHA(c->srcFormat)) || (!isALPHA(c->dstFormat))) && plane == 3){
!(isALPHA(c->srcFormat) && isALPHA(c->dstFormat)) && plane == 3
> + if (isALPHA(c->dstFormat))
> + memset(dst[3], 255, dstStride[3]*height);
as said elewhere writing must be limited to 0..width
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Dictatorship naturally arises out of democracy, and the most aggravated
form of tyranny and slavery out of the most extreme liberty. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090305/808a0fed/attachment.pgp>
More information about the ffmpeg-devel
mailing list