[FFmpeg-devel] [RFC] Alpha support
Cédric Schieli
cschieli
Sun Feb 1 16:52:02 CET 2009
[...]
> > - case PIX_FMT_RGB32:\
> > - case PIX_FMT_BGR32:\
> > - case PIX_FMT_RGB32_1:\
> > - case PIX_FMT_BGR32_1:\
> > - func(uint32_t)\
> > - ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
> > - ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
> > - } \
> > + case PIX_FMT_RGBA:\
> > + case PIX_FMT_BGRA:\
> > + if (!CONFIG_SMALL && c->alpPixBuf)\
> > + {\
> > + func(uint32_t,1)\
> > + ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] +
> (A1<<24);\
> > + ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] +
> (A2<<24);\
> > + }\
> > + }else{\
> > + func(uint32_t,CONFIG_SMALL)\
> > + ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] +
> (CONFIG_SMALL ? (A1<<24) : 0);\
> > + ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] +
> (CONFIG_SMALL ? (A2<<24) : 0);\
> > + }\
> > + }\
> > + break;\
> > + case PIX_FMT_ARGB:\
> > + case PIX_FMT_ABGR:\
>
> Is it faster the way you wrote it compared to a table that does <<24 vs.
> <<0?
> I am asking because the table would lead to simpler and less duplicated code.
>
I've tried the table approach with this:
SwsContext *sws_getContext(...){
...
if (c->alpPixBuf){
int j = (c->dstFormat == PIX_FMT_RGBA || c->dstFormat ==
PIX_FMT_BGRA) ? 24 : 0;
for(i=0; i<256; i++)
c->table_A[i] = (i<<j);
}else{
memset(c->table_A, 0, sizeof(c->table_A));
}
...
}
#define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
switch(c->dstFormat)\
{\
case PIX_FMT_RGB32:\
case PIX_FMT_BGR32:\
case PIX_FMT_RGB32_1:\
case PIX_FMT_BGR32_1:\
if (!CONFIG_SMALL && c->alpPixBuf){\
uint32_t *a = c->table_A;\
func(uint32_t,1)\
START_TIMER\
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + a[A1];\
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + a[A2];\
STOP_TIMER(__func__)\
}\
}else{\
int needAlpha = (int)c->alpPixBuf;\
uint32_t *a = c->table_A;\
func(uint32_t,CONFIG_SMALL ? needAlpha : 0)\
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] +
(CONFIG_SMALL ? a[A1] : 0);\
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] +
(CONFIG_SMALL ? a[A2] : 0);\
}\
}\
break;\
...
and benchmarked it with the following command (rgb32.png is a 640x400 RGB32 image containing
alpha information):
for i in $(seq 1 100); do ./ffmpeg -sws_flags +bitexact -i /tmp/rgb32.png -s
2048x8192 /tmp/out.png 2>&1 | grep dezi | tail -n 1; done
The result is an average of 420.82 dezicycles.
With my original approach:
#define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
switch(c->dstFormat)\
{\
case PIX_FMT_RGBA:\
case PIX_FMT_BGRA:\
if (!CONFIG_SMALL && c->alpPixBuf)\
{\
func(uint32_t,1)\
START_TIMER\
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
STOP_TIMER(__func__)\
}\
}else{\
int needAlpha = (int)c->alpPixBuf;\
func(uint32_t,CONFIG_SMALL ? needAlpha : 0)\
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] +
(CONFIG_SMALL ? (A1<<24) : 0);\
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] +
(CONFIG_SMALL ? (A2<<24) : 0);\
}\
}\
break;\
...
The same benchmark gives an average of 419.93 dezicycles.
So there is a tiny (but nonetheless real) penalty with the table approach: 0.89 dezicycles, or about 0.2%.
Regards,
Cédric Schieli
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel at mplayerhq.hu
https://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-devel
-------------- next part --------------
A non-text attachment was scrubbed...
Name: benchmark-table.log
Type: text/x-log
Size: 5999 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090201/d5493194/attachment.bin>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: benchmark-case.log
Type: text/x-log
Size: 5999 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090201/d5493194/attachment-0001.bin>
More information about the ffmpeg-devel
mailing list