[FFmpeg-devel] [PATCH] swscale alpha channel support

Michael Niedermayer michaelni
Sat Feb 28 00:32:04 CET 2009


On Fri, Feb 27, 2009 at 11:30:25PM +0100, Cédric Schieli wrote:
> 2009/2/27 Michael Niedermayer <michaelni at gmx.at>:
> > On Fri, Feb 27, 2009 at 06:28:38PM +0100, Cédric Schieli wrote:
> >> Hi all,
> >>
> [...]
> >>
> >> #3 : sws_treat_yuva420p_as_yuv420p.patch
> >> Use (srcFormat==yuv420p) codepath for the (srcFormat=yuva420p) case in
> >> various places where it is appropriate
> >
> > ok
> 
> Applied
> 

> >
> >
> >>
> >> #4 : sws_default_alpha_value.patch
> >> When converting from a non alpha format to an alpha format, defaults
> >> to all ones rather than all zeroes
> >> This patch introduces some (small) differences in swscale-example
> >> output for RGB32_1 and BGR32_1 (see swscale-example.log.diff) that I
> >> can't reproduce manually
> >
> > these look rather harmless, maybe some uninitialized var or missing emms,
> > i doubt there is a bug in your code
> > anyway comments below
> >
> 
> It can't be missing emms as the same differences occur when built
> with --disable-mmx

does valgrind say something?


> 
> [...]
> >>
> >> #8 : sws_output_yuva420p.patch
> >> Now that yuva420p can be scaled, it can be added to output supported format
> >
> > ok
> 
> This one makes swscale-example crash if #9 is not applied, so it will wait
> 
> [...]
> >>
> >>
> >> One remaining issue is a strange difference on x86_64 for some cases
> >> (see swscale-example.x86_64.log.diff). After some debugging, it seems
> >> it comes from #9. I'll investigate more on this one.
> >
> > until that is solved #9 (or whatever causes it) is not acceptable
> > what has to be noted is that all of the differences are for flags=1
> > and involve either yuv410p or yuv411p
> >
> 
> Yes, that's what I noticed too.
> I've adapted it to work without the others applied, and the
> differences remain, so the bug is either in the patch itself, or
> already in libswscale
> 
> > [...]
> >
> >
> >
> > #2
> >> Index: ffmpeg/libavcodec/imgconvert.c
> >> ===================================================================
> >> --- ffmpeg.orig/libavcodec/imgconvert.c ? ? ? 2009-02-27 11:35:30.086183618 +0100
> >> +++ ffmpeg/libavcodec/imgconvert.c ? ?2009-02-27 11:35:49.874181190 +0100
> >> @@ -721,7 +721,7 @@ int avpicture_layout(const AVPicture* sr
> >> ? ? ? ? ? ? ? ? ? ? ? unsigned char *dest, int dest_size)
> >> ?{
> >> ? ? ?const PixFmtInfo* pf = &pix_fmt_info[pix_fmt];
> >> - ? ?int i, j, w, h, data_planes;
> >> + ? ?int i, j, w, ow, h, oh, data_planes;
> >> ? ? ?const unsigned char* s;
> >> ? ? ?int size = avpicture_get_size(pix_fmt, width, height);
> >>
> >
> >> @@ -753,8 +753,13 @@ int avpicture_layout(const AVPicture* sr
> >>
> >> ? ? ?for (i=0; i<data_planes; i++) {
> >> ? ? ? ? ? if (i == 1) {
> >> + ? ? ? ? ? ? ow = w;
> >> + ? ? ? ? ? ? oh = h;
> >
> > I'd do this before the loop
> >
> 
> That doesn't change much, but why not.
> Patch updated.
> 
> > [...]
> >
> >
> > #4
> > [...]
> >> @@ -1661,11 +1661,17 @@ static inline void RENAME(name ## _half)
> >> ? ? ?int i;\
> >> ? ? ?for (i=0; i<width; i++)\
> >> ? ? ?{\
> >> - ? ? ? ?int pix0= ((type*)src)[2*i+0];\
> >> - ? ? ? ?int pix1= ((type*)src)[2*i+1];\
> >> - ? ? ? ?int g= (pix0&maskg)+(pix1&maskg);\
> >> - ? ? ? ?int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
> >> - ? ? ? ?int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
> >> + ? ? ? ?int pix0, pix1, g, b, r;\
> >> + ? ? ? ?if (alpha){\
> >> + ? ? ? ? ? ?pix0= ((type*)src)[2*i+0]&(maskr|maskg|maskb);\
> >> + ? ? ? ? ? ?pix1= ((type*)src)[2*i+1]&(maskr|maskg|maskb);\
> >> + ? ? ? ?}else{\
> >> + ? ? ? ? ? ?pix0= ((type*)src)[2*i+0];\
> >> + ? ? ? ? ? ?pix1= ((type*)src)[2*i+1];\
> >> + ? ? ? ?}\
> >> + ? ? ? ?g= (pix0&maskg)+(pix1&maskg);\
> >> + ? ? ? ?b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
> >> + ? ? ? ?r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
> >
> > int pix0= ((type*)src)[2*i+0];\
> > int pix1= ((type*)src)[2*i+1];\
> > int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\
> > int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
> > int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
> > g &= maskg|(2*maskg)
> >
> > 1 & less :)
> 
> Nice one.
> Patch updated.
> 
> [...]
> >> Index: ffmpeg/libswscale/swscale_internal.h
> >> ===================================================================
> >> --- ffmpeg.orig/libswscale/swscale_internal.h 2009-02-27 10:06:03.841935207 +0100
> >> +++ ffmpeg/libswscale/swscale_internal.h ? ? ?2009-02-27 11:36:09.826184312 +0100
> >> @@ -273,6 +273,13 @@ const char *sws_format_name(int format);
> >> ? ? ? ? ?|| (x)==PIX_FMT_MONOBLACK ? \
> >> ? ? ? ? ?|| (x)==PIX_FMT_MONOWHITE ? \
> >> ? ? ?)
> >> +#define isALPHA(x) ? ? ?( ? ? ? ? ? \
> >> + ? ? ? ? ? (x)==PIX_FMT_BGR32 ? ? ? \
> >> + ? ? ? ?|| (x)==PIX_FMT_BGR32_1 ? ? \
> >> + ? ? ? ?|| (x)==PIX_FMT_RGB32 ? ? ? \
> >> + ? ? ? ?|| (x)==PIX_FMT_RGB32_1 ? ? \
> >> + ? ? ? ?|| (x)==PIX_FMT_YUVA420P ? ?\
> >> + ? ?)
> >>
> >> ?static inline int fmt_depth(int fmt)
> >> ?{
> >
> > ok
> 
> Applied
> 
> >
> >
> > #6
> > [...]
> >> +#define YUV2RGBFUNC(func_name, dst_type, alpha) \
> >> ?static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
> >> ? ? ? ? ? ? ? ? ? ? ? int srcSliceH, uint8_t* dst[], int dstStride[]){\
> >> ? ? ?int y;\
> >> ?\
> >
> >> - ? ?if (c->srcFormat == PIX_FMT_YUV422P) {\
> >> + ? ?if (!alpha && c->srcFormat == PIX_FMT_YUV422P) {\
> >> ? ? ? ? ?srcStride[1] *= 2;\
> >> ? ? ? ? ?srcStride[2] *= 2;\
> >> ? ? ?}\
> >
> > why?
> 
> To optimize out the check in the alpha==1 case
> 
> >
> > [...]
> >> Index: ffmpeg/libswscale/yuv2rgb_template.c
> >> ===================================================================
> >> --- ffmpeg.orig/libswscale/yuv2rgb_template.c 2009-02-27 11:36:06.914181648 +0100
> >> +++ ffmpeg/libswscale/yuv2rgb_template.c ? ? ?2009-02-27 15:40:20.681930923 +0100
> >> @@ -451,3 +451,23 @@ static inline int RENAME(yuv420_rgb32)(S
> >>
> >> ? ? ?YUV2RGB_ENDLOOP(4)
> >> ?}
> >> +
> >> +#if CONFIG_SWSCALE_ALPHA
> >> +static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?int srcSliceH, uint8_t* dst[], int dstStride[]){
> >> + ? ?int y, h_size;
> >> +
> >> + ? ?YUV2RGB_LOOP(4)
> >> +
> >> + ? ? ? ?*(uint8_t **)(&c->u_temp) = src[3] + y*srcStride[3] - 2*index; /* pa-2index */
> >> + ? ? ? ?YUV2RGB_INIT
> >> + ? ? ? ?YUV2RGB
> >> + ? ? ? ?"mov ? ? ? ? ? ? ? %5, "ESP_OFFSET"(%4);" /* Backup py-2index */
> >> + ? ? ? ?"mov ? ? "U_TEMP"(%4), %5;" ? ? ? ? ? ? ? /* Fetch pa-2index */
> >> + ? ? ? ?"movq ? ? (%5, %0, 2), %%mm3;" ? ? ? ? ? ?/* Load 8 A A7 A6 A5 A4 A3 A2 A1 A0 */
> >> + ? ? ? ?"mov "ESP_OFFSET"(%4), %5;" ? ? ? ? ? ? ? /* Restore py-2index */
> >> + ? ? ? ?RGB_PLANAR2PACKED32
> >> +
> >> + ? ?YUV2RGB_ENDLOOP(4)
> >> +}
> >> +#endif
> >
> > there are 7 registers the asm uses just 6 so this save&restore should be
> > avoidable even if we then have to fallback to C when HAVE_7REGS is not
> > set
> 
> Patch updated
> 
> 
> Regards,
> Cédric Schieli

> Index: ffmpeg/libavcodec/imgconvert.c
> ===================================================================
> --- ffmpeg.orig/libavcodec/imgconvert.c	2009-02-27 23:03:59.311598606 +0100
> +++ ffmpeg/libavcodec/imgconvert.c	2009-02-27 23:27:11.755632484 +0100
> @@ -721,7 +721,7 @@
>                       unsigned char *dest, int dest_size)
>  {
>      const PixFmtInfo* pf = &pix_fmt_info[pix_fmt];
> -    int i, j, w, h, data_planes;
> +    int i, j, w, ow, h, oh, data_planes;
>      const unsigned char* s;
>      int size = avpicture_get_size(pix_fmt, width, height);
>  
> @@ -751,10 +751,16 @@
>          h = height;
>      }
>  
> +    ow = w;
> +    oh = h;
> +
>      for (i=0; i<data_planes; i++) {
>           if (i == 1) {
>               w = width >> pf->x_chroma_shift;
>               h = height >> pf->y_chroma_shift;
> +         } else if (i == 3) {
> +             w = ow;
> +             h = oh;
>           }
>           s = src->data[i];
>           for(j=0; j<h; j++) {

patch ok


> Index: ffmpeg/libswscale/swscale_template.c
> ===================================================================
> --- ffmpeg.orig/libswscale/swscale_template.c	2009-02-27 23:01:40.355609018 +0100
> +++ ffmpeg/libswscale/swscale_template.c	2009-02-27 23:27:15.219633216 +0100
> @@ -1031,7 +1031,7 @@
>              case PIX_FMT_RGB32:
>                  YSCALEYUV2PACKEDX_ACCURATE
>                  YSCALEYUV2RGBX
> -                "pxor %%mm7, %%mm7 \n\t"
> +                "pcmpeqd %%mm7, %%mm7 \n\t"
>                  WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
>  
>                  YSCALEYUV2PACKEDX_END
> @@ -1097,7 +1097,7 @@
>              case PIX_FMT_RGB32:
>                  YSCALEYUV2PACKEDX
>                  YSCALEYUV2RGBX
> -                "pxor %%mm7, %%mm7 \n\t"
> +                "pcmpeqd %%mm7, %%mm7 \n\t"
>                  WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
>                  YSCALEYUV2PACKEDX_END
>                  return;
> @@ -1196,7 +1196,7 @@
>                  "mov        %4, %%"REG_b"               \n\t"
>                  "push %%"REG_BP"                        \n\t"
>                  YSCALEYUV2RGB(%%REGBP, %5)
> -                "pxor    %%mm7, %%mm7                   \n\t"
> +                "pcmpeqd %%mm7, %%mm7                   \n\t"
>                  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
>                  "pop %%"REG_BP"                         \n\t"
>                  "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
> @@ -1312,7 +1312,7 @@
>                  "mov        %4, %%"REG_b"               \n\t"
>                  "push %%"REG_BP"                        \n\t"
>                  YSCALEYUV2RGB1(%%REGBP, %5)
> -                "pxor    %%mm7, %%mm7                   \n\t"
> +                "pcmpeqd %%mm7, %%mm7                   \n\t"
>                  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
>                  "pop %%"REG_BP"                         \n\t"
>                  "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
> @@ -1405,7 +1405,7 @@
>                  "mov        %4, %%"REG_b"               \n\t"
>                  "push %%"REG_BP"                        \n\t"
>                  YSCALEYUV2RGB1b(%%REGBP, %5)
> -                "pxor    %%mm7, %%mm7                   \n\t"
> +                "pcmpeqd %%mm7, %%mm7                   \n\t"
>                  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
>                  "pop %%"REG_BP"                         \n\t"
>                  "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
> @@ -1642,7 +1642,7 @@
>  BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
>  BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
>  
> -#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
> +#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
>  static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
>  {\
>      int i;\
> @@ -1663,9 +1663,10 @@
>      {\
>          int pix0= ((type*)src)[2*i+0];\
>          int pix1= ((type*)src)[2*i+1];\
> -        int g= (pix0&maskg)+(pix1&maskg);\
> +        int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\
>          int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
>          int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
> +        g&= maskg|(2*maskg);\
>  \
>          g>>=shg;\
>  \
> @@ -1674,12 +1675,12 @@
>      }\
>  }
>  
> -BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
> -BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
> -BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
> -BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
> -BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
> -BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
> +BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
> +BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
> +BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
> +BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
> +BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
> +BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
>  
>  #if HAVE_MMX
>  static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
> Index: ffmpeg/libswscale/swscale.c
> ===================================================================
> --- ffmpeg.orig/libswscale/swscale.c	2009-02-27 23:27:08.024600467 +0100
> +++ ffmpeg/libswscale/swscale.c	2009-02-27 23:27:15.223632327 +0100
> @@ -922,12 +922,12 @@
>      switch(c->dstFormat){
>      case PIX_FMT_ARGB:
>          dest++;
> -        aidx= 0;
> +        aidx= -1;
>      case PIX_FMT_RGB24:
>          aidx--;
>      case PIX_FMT_RGBA:
>          YSCALE_YUV_2_RGBX_FULL_C(1<<21)
> -            dest[aidx]= 0;
> +            dest[aidx]= 255;
>              dest[0]= R>>22;
>              dest[1]= G>>22;
>              dest[2]= B>>22;
> @@ -936,12 +936,12 @@
>          break;
>      case PIX_FMT_ABGR:
>          dest++;
> -        aidx= 0;
> +        aidx= -1;
>      case PIX_FMT_BGR24:
>          aidx--;
>      case PIX_FMT_BGRA:
>          YSCALE_YUV_2_RGBX_FULL_C(1<<21)
> -            dest[aidx]= 0;
> +            dest[aidx]= 255;
>              dest[0]= B>>22;
>              dest[1]= G>>22;
>              dest[2]= R>>22;
> Index: ffmpeg/libswscale/rgb2rgb.c
> ===================================================================
> --- ffmpeg.orig/libswscale/rgb2rgb.c	2009-02-27 23:01:40.367598182 +0100
> +++ ffmpeg/libswscale/rgb2rgb.c	2009-02-27 23:27:15.223632327 +0100
> @@ -94,6 +94,7 @@
>  DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL;
>  DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL;
>  DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL;
> +DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
>  DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL;
>  DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
>  DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
> @@ -281,7 +282,7 @@
>      {
>          #ifdef WORDS_BIGENDIAN
>              /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */
> -            dst[4*i + 0] = 0;
> +            dst[4*i + 0] = 255;
>              dst[4*i + 1] = src[3*i + 0];
>              dst[4*i + 2] = src[3*i + 1];
>              dst[4*i + 3] = src[3*i + 2];
> @@ -289,7 +290,7 @@
>              dst[4*i + 0] = src[3*i + 2];
>              dst[4*i + 1] = src[3*i + 1];
>              dst[4*i + 2] = src[3*i + 0];
> -            dst[4*i + 3] = 0;
> +            dst[4*i + 3] = 255;
>          #endif
>      }
>  }
> @@ -305,7 +306,7 @@
>          register uint16_t bgr;
>          bgr = *s++;
>          #ifdef WORDS_BIGENDIAN
> -            *d++ = 0;
> +            *d++ = 255;
>              *d++ = (bgr&0x1F)<<3;
>              *d++ = (bgr&0x7E0)>>3;
>              *d++ = (bgr&0xF800)>>8;
> @@ -313,7 +314,7 @@
>              *d++ = (bgr&0xF800)>>8;
>              *d++ = (bgr&0x7E0)>>3;
>              *d++ = (bgr&0x1F)<<3;
> -            *d++ = 0;
> +            *d++ = 255;
>          #endif
>      }
>  }
> @@ -369,7 +370,7 @@
>          register uint16_t bgr;
>          bgr = *s++;
>          #ifdef WORDS_BIGENDIAN
> -            *d++ = 0;
> +            *d++ = 255;
>              *d++ = (bgr&0x1F)<<3;
>              *d++ = (bgr&0x3E0)>>2;
>              *d++ = (bgr&0x7C00)>>7;
> @@ -377,7 +378,7 @@
>              *d++ = (bgr&0x7C00)>>7;
>              *d++ = (bgr&0x3E0)>>2;
>              *d++ = (bgr&0x1F)<<3;
> -            *d++ = 0;
> +            *d++ = 255;
>          #endif
>      }
>  }
> Index: ffmpeg/libswscale/rgb2rgb_template.c
> ===================================================================
> --- ffmpeg.orig/libswscale/rgb2rgb_template.c	2009-02-27 23:01:40.375627344 +0100
> +++ ffmpeg/libswscale/rgb2rgb_template.c	2009-02-27 23:27:15.227630601 +0100
> @@ -83,7 +83,7 @@
>      #if HAVE_MMX
>          __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
>          mm_end = end - 23;
> -        __asm__ volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
> +        __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
>          while (s < mm_end)
>          {
>              __asm__ volatile(
> @@ -96,10 +96,10 @@
>              "punpckldq   15%1, %%mm2    \n\t"
>              "movd        18%1, %%mm3    \n\t"
>              "punpckldq   21%1, %%mm3    \n\t"
> -            "pand       %%mm7, %%mm0    \n\t"
> -            "pand       %%mm7, %%mm1    \n\t"
> -            "pand       %%mm7, %%mm2    \n\t"
> -            "pand       %%mm7, %%mm3    \n\t"
> +            "por        %%mm7, %%mm0    \n\t"
> +            "por        %%mm7, %%mm1    \n\t"
> +            "por        %%mm7, %%mm2    \n\t"
> +            "por        %%mm7, %%mm3    \n\t"
>              MOVNTQ"     %%mm0,   %0     \n\t"
>              MOVNTQ"     %%mm1,  8%0     \n\t"
>              MOVNTQ"     %%mm2, 16%0     \n\t"
> @@ -117,7 +117,7 @@
>      {
>      #ifdef WORDS_BIGENDIAN
>          /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
> -        *dest++ = 0;
> +        *dest++ = 255;
>          *dest++ = s[2];
>          *dest++ = s[1];
>          *dest++ = s[0];
> @@ -126,7 +126,7 @@
>          *dest++ = *s++;
>          *dest++ = *s++;
>          *dest++ = *s++;
> -        *dest++ = 0;
> +        *dest++ = 255;
>      #endif
>      }
>  }

hunks from start of patch to here ok


> @@ -1213,6 +1213,7 @@
>      end = s + src_size/2;
>  #if HAVE_MMX
>      __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
> +    __asm__ volatile("movq  %0,%%mm6"::"m"(mask32a):"memory");
>      __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
>      mm_end = end - 3;
>      while (s < mm_end)
> @@ -1245,6 +1246,8 @@
>          "psllq        $16, %%mm5    \n\t"
>          "por        %%mm4, %%mm3    \n\t"
>          "por        %%mm5, %%mm3    \n\t"
> +        "por        %%mm6, %%mm0    \n\t"
> +        "por        %%mm6, %%mm3    \n\t"
>          MOVNTQ"     %%mm0,  %0      \n\t"
>          MOVNTQ"     %%mm3, 8%0      \n\t"
>          :"=m"(*d)

the code can be implemented with
significantly fewer instructions (as shown in my previous review)
and at the same time support alpha = 0xFF


> @@ -1265,7 +1268,7 @@
>          register uint16_t bgr;
>          bgr = *s++;
>  #ifdef WORDS_BIGENDIAN
> -        *d++ = 0;
> +        *d++ = 255;
>          *d++ = (bgr&0x7C00)>>7;
>          *d++ = (bgr&0x3E0)>>2;
>          *d++ = (bgr&0x1F)<<3;
> @@ -1273,7 +1276,7 @@
>          *d++ = (bgr&0x1F)<<3;
>          *d++ = (bgr&0x3E0)>>2;
>          *d++ = (bgr&0x7C00)>>7;
> -        *d++ = 0;
> +        *d++ = 255;
>  #endif
>  
>  #endif

2 hunks ok


> @@ -1291,6 +1294,7 @@
>      end = s + src_size/2;
>  #if HAVE_MMX
>      __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
> +    __asm__ volatile("movq  %0,%%mm6"::"m"(mask32a):"memory");
>      __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
>      mm_end = end - 3;
>      while (s < mm_end)
> @@ -1323,6 +1327,8 @@
>          "psllq        $16, %%mm5    \n\t"
>          "por        %%mm4, %%mm3    \n\t"
>          "por        %%mm5, %%mm3    \n\t"
> +        "por        %%mm6, %%mm0    \n\t"
> +        "por        %%mm6, %%mm3    \n\t"
>          MOVNTQ"     %%mm0, %0       \n\t"
>          MOVNTQ"     %%mm3, 8%0      \n\t"
>          :"=m"(*d)

same issue


> @@ -1339,7 +1345,7 @@
>          register uint16_t bgr;
>          bgr = *s++;
>  #ifdef WORDS_BIGENDIAN
> -        *d++ = 0;
> +        *d++ = 255;
>          *d++ = (bgr&0xF800)>>8;
>          *d++ = (bgr&0x7E0)>>3;
>          *d++ = (bgr&0x1F)<<3;
> @@ -1347,7 +1353,7 @@
>          *d++ = (bgr&0x1F)<<3;
>          *d++ = (bgr&0x7E0)>>3;
>          *d++ = (bgr&0xF800)>>8;
> -        *d++ = 0;
> +        *d++ = 255;
>  #endif
>      }
>  }
> Index: ffmpeg/libswscale/yuv2rgb_template.c
> ===================================================================
> --- ffmpeg.orig/libswscale/yuv2rgb_template.c	2009-02-27 23:01:40.379598031 +0100
> +++ ffmpeg/libswscale/yuv2rgb_template.c	2009-02-27 23:27:15.227630601 +0100
> @@ -446,7 +446,7 @@
>  
>          YUV2RGB_INIT
>          YUV2RGB
> -        "pxor      %%mm3, %%mm3;"   /* zero mm3 */
> +        "pcmpeqd   %%mm3, %%mm3;"   /* fill mm3 */
>          RGB_PLANAR2PACKED32
>  
>      YUV2RGB_ENDLOOP(4)
> Index: ffmpeg/libswscale/yuv2rgb.c
> ===================================================================
> --- ffmpeg.orig/libswscale/yuv2rgb.c	2009-02-27 23:01:40.387598140 +0100
> +++ ffmpeg/libswscale/yuv2rgb.c	2009-02-27 23:27:15.227630601 +0100
> @@ -533,7 +533,7 @@
>      uint8_t *y_table;
>      uint16_t *y_table16;
>      uint32_t *y_table32;
> -    int i, base, rbase, gbase, bbase;
> +    int i, base, rbase, gbase, bbase, abase;
>      const int yoffs = fullRange ? 384 : 326;
>  
>      int64_t crv =  inv_table[0];
> @@ -659,12 +659,13 @@
>          rbase = base + (isRgb ? 16 : 0);
>          gbase = base + 8;
>          bbase = base + (isRgb ? 0 : 16);
> +        abase = (c->dstFormat == PIX_FMT_RGBA || c->dstFormat == PIX_FMT_BGRA) ? 24 : 0;
>          c->yuvTable = av_malloc(1024*3*4);
>          y_table32 = c->yuvTable;
>          yb = -(384<<16) - oy;
>          for (i = 0; i < 1024; i++) {
>              uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
> -            y_table32[i     ] = yval << rbase;
> +            y_table32[i     ] = (yval << rbase) + (255 << abase);
>              y_table32[i+1024] = yval << gbase;
>              y_table32[i+2048] = yval << bbase;
>              yb += cy;

5 hunks ok


[...]

> Index: ffmpeg/libswscale/yuv2rgb.c
> ===================================================================
> --- ffmpeg.orig/libswscale/yuv2rgb.c	2009-02-27 23:27:15.227630601 +0100
> +++ ffmpeg/libswscale/yuv2rgb.c	2009-02-27 23:27:23.171601482 +0100
> @@ -101,12 +101,18 @@
>      Y = src[2*i+1];                                          \
>      dst[6*i+3] = b[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = r[Y];
>  
> -#define YUV2RGBFUNC(func_name, dst_type) \
> +#define PUTRGBA(dst,ysrc,asrc,i,o,s)                    \
> +    Y = ysrc[2*i+o];                                    \
> +    dst[2*i  ] = r[Y] + g[Y] + b[Y] + (asrc[2*i  ]<<s); \
> +    Y = ysrc[2*i+1-o];                                  \
> +    dst[2*i+1] = r[Y] + g[Y] + b[Y] + (asrc[2*i+1]<<s);
> +
> +#define YUV2RGBFUNC(func_name, dst_type, alpha) \
>  static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
>                       int srcSliceH, uint8_t* dst[], int dstStride[]){\
>      int y;\
>  \
> -    if (c->srcFormat == PIX_FMT_YUV422P) {\
> +    if (!alpha && c->srcFormat == PIX_FMT_YUV422P) {\
>          srcStride[1] *= 2;\
>          srcStride[2] *= 2;\
>      }\
> @@ -119,7 +125,12 @@
>          uint8_t *py_2 = py_1 + srcStride[0];\
>          uint8_t *pu = src[1] + (y>>1)*srcStride[1];\
>          uint8_t *pv = src[2] + (y>>1)*srcStride[2];\
> +        uint8_t av_unused *pa_1, *pa_2;\
>          unsigned int h_size = c->dstW>>3;\
> +        if (alpha){\
> +            pa_1 = src[3] + y*srcStride[3];\
> +            pa_2 = pa_1 + srcStride[3];\
> +        }\
>          while (h_size--) {\
>              int av_unused U, V;\
>              int Y;\
> @@ -145,7 +156,7 @@
>      ENDYUV2RGBLINE(dst_delta)\
>      ENDYUV2RGBFUNC()
>  
> -YUV2RGBFUNC(yuv2rgb_c_32, uint32_t)
> +YUV2RGBFUNC(yuv2rgb_c_32, uint32_t, 0)
>      LOADCHROMA(0);
>      PUTRGB(dst_1,py_1,0,0);
>      PUTRGB(dst_2,py_2,0,1);
> @@ -174,7 +185,71 @@
>      PUTRGB(dst_1,py_1,1,0);
>  ENDYUV2RGBFUNC()
>  
> -YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t)
> +#if CONFIG_SWSCALE_ALPHA
> +YUV2RGBFUNC(yuva2rgba_c, uint32_t, 1)
> +    LOADCHROMA(0);
> +    PUTRGBA(dst_1,py_1,pa_1,0,0,24);
> +    PUTRGBA(dst_2,py_2,pa_2,0,1,24);
> +
> +    LOADCHROMA(1);
> +    PUTRGBA(dst_2,py_2,pa_1,1,1,24);
> +    PUTRGBA(dst_1,py_1,pa_2,1,0,24);
> +    LOADCHROMA(1);
> +    PUTRGBA(dst_2,py_2,pa_1,1,1,24);
> +    PUTRGBA(dst_1,py_1,pa_2,1,0,24);
> +
> +    LOADCHROMA(2);
> +    PUTRGBA(dst_1,py_1,pa_1,2,0,24);
> +    PUTRGBA(dst_2,py_2,pa_2,2,1,24);
> +
> +    LOADCHROMA(3);
> +    PUTRGBA(dst_2,py_2,pa_1,3,1,24);
> +    PUTRGBA(dst_1,py_1,pa_2,3,0,24);
> +    pa_1 += 8;\
> +    pa_2 += 8;\
> +ENDYUV2RGBLINE(8)
> +    LOADCHROMA(0);
> +    PUTRGBA(dst_1,py_1,pa_1,0,0,24);
> +    PUTRGBA(dst_2,py_2,pa_2,0,1,24);
> +
> +    LOADCHROMA(1);
> +    PUTRGBA(dst_2,py_2,pa_1,1,1,24);
> +    PUTRGBA(dst_1,py_1,pa_2,1,0,24);
> +ENDYUV2RGBFUNC()
> +
> +YUV2RGBFUNC(yuva2argb_c, uint32_t, 1)
> +    LOADCHROMA(0);
> +    PUTRGBA(dst_1,py_1,pa_1,0,0,0);
> +    PUTRGBA(dst_2,py_2,pa_2,0,1,0);
> +
> +    LOADCHROMA(1);
> +    PUTRGBA(dst_2,py_2,pa_2,1,1,0);
> +    PUTRGBA(dst_1,py_1,pa_1,1,0,0);
> +    LOADCHROMA(1);
> +    PUTRGBA(dst_2,py_2,pa_2,1,1,0);
> +    PUTRGBA(dst_1,py_1,pa_1,1,0,0);
> +
> +    LOADCHROMA(2);
> +    PUTRGBA(dst_1,py_1,pa_1,2,0,0);
> +    PUTRGBA(dst_2,py_2,pa_2,2,1,0);
> +
> +    LOADCHROMA(3);
> +    PUTRGBA(dst_2,py_2,pa_2,3,1,0);
> +    PUTRGBA(dst_1,py_1,pa_1,3,0,0);
> +    pa_1 += 8;\
> +    pa_2 += 8;\
> +ENDYUV2RGBLINE(8)
> +    LOADCHROMA(0);
> +    PUTRGBA(dst_1,py_1,pa_1,0,0,0);
> +    PUTRGBA(dst_2,py_2,pa_2,0,1,0);
> +
> +    LOADCHROMA(1);
> +    PUTRGBA(dst_2,py_2,pa_2,1,1,0);
> +    PUTRGBA(dst_1,py_1,pa_1,1,0,0);
> +ENDYUV2RGBFUNC()
> +#endif
> +
> +YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t, 0)
>      LOADCHROMA(0);
>      PUTRGB24(dst_1,py_1,0);
>      PUTRGB24(dst_2,py_2,0);
> @@ -201,7 +276,7 @@
>  ENDYUV2RGBFUNC()
>  
>  // only trivial mods from yuv2rgb_c_24_rgb
> -YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t, 0)
>      LOADCHROMA(0);
>      PUTBGR24(dst_1,py_1,0);
>      PUTBGR24(dst_2,py_2,0);
> @@ -229,7 +304,7 @@
>  
>  // This is exactly the same code as yuv2rgb_c_32 except for the types of
>  // r, g, b, dst_1, dst_2
> -YUV2RGBFUNC(yuv2rgb_c_16, uint16_t)
> +YUV2RGBFUNC(yuv2rgb_c_16, uint16_t, 0)
>      LOADCHROMA(0);
>      PUTRGB(dst_1,py_1,0,0);
>      PUTRGB(dst_2,py_2,0,1);
> @@ -249,7 +324,7 @@
>  
>  // This is exactly the same code as yuv2rgb_c_32 except for the types of
>  // r, g, b, dst_1, dst_2
> -YUV2RGBFUNC(yuv2rgb_c_8, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_8, uint8_t, 0)
>      LOADCHROMA(0);
>      PUTRGB(dst_1,py_1,0,0);
>      PUTRGB(dst_2,py_2,0,1);
> @@ -268,7 +343,7 @@
>  CLOSEYUV2RGBFUNC(8)
>  
>  // r, g, b, dst_1, dst_2
> -YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
>      const uint8_t *d32 = dither_8x8_32[y&7];
>      const uint8_t *d64 = dither_8x8_73[y&7];
>  #define PUTRGB8(dst,src,i,o)                                    \
> @@ -297,7 +372,7 @@
>  
>  // This is exactly the same code as yuv2rgb_c_32 except for the types of
>  // r, g, b, dst_1, dst_2
> -YUV2RGBFUNC(yuv2rgb_c_4, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_4, uint8_t, 0)
>      int acc;
>  #define PUTRGB4(dst,src,i)          \
>      Y = src[2*i];                   \
> @@ -323,7 +398,7 @@
>      PUTRGB4(dst_1,py_1,3);
>  CLOSEYUV2RGBFUNC(4)
>  
> -YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
>      const uint8_t *d64 =  dither_8x8_73[y&7];
>      const uint8_t *d128 = dither_8x8_220[y&7];
>      int acc;
> @@ -354,7 +429,7 @@
>  
>  // This is exactly the same code as yuv2rgb_c_32 except for the types of
>  // r, g, b, dst_1, dst_2
> -YUV2RGBFUNC(yuv2rgb_c_4b, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_4b, uint8_t, 0)
>      LOADCHROMA(0);
>      PUTRGB(dst_1,py_1,0,0);
>      PUTRGB(dst_2,py_2,0,1);
> @@ -372,7 +447,7 @@
>      PUTRGB(dst_1,py_1,3,0);
>  CLOSEYUV2RGBFUNC(8)
>  
> -YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
>      const uint8_t *d64 =  dither_8x8_73[y&7];
>      const uint8_t *d128 = dither_8x8_220[y&7];
>  
> @@ -399,7 +474,7 @@
>      PUTRGB4DB(dst_1,py_1,3,6);
>  CLOSEYUV2RGBFUNC(8)
>  
> -YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t)
> +YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
>          const uint8_t *d128 = dither_8x8_220[y&7];
>          char out_1 = 0, out_2 = 0;
>          g= c->table_gU[128] + c->table_gV[128];
> @@ -432,7 +507,12 @@
>  #if (HAVE_MMX2 || HAVE_MMX) && CONFIG_GPL
>      if (c->flags & SWS_CPU_CAPS_MMX2) {
>          switch (c->dstFormat) {
> -        case PIX_FMT_RGB32:  return yuv420_rgb32_MMX2;
> +        case PIX_FMT_RGB32:
> +#if HAVE_7REGS
> +            return (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) ? yuva420_rgb32_MMX2 : yuv420_rgb32_MMX2;
> +#else
> +            if (!(CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P)) return yuv420_rgb32_MMX2;
> +#endif
>          case PIX_FMT_BGR24:  return yuv420_rgb24_MMX2;
>          case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
>          case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
> @@ -440,7 +520,12 @@
>      }
>      if (c->flags & SWS_CPU_CAPS_MMX) {
>          switch (c->dstFormat) {
> -        case PIX_FMT_RGB32:  return yuv420_rgb32_MMX;
> +        case PIX_FMT_RGB32:
> +#if HAVE_7REGS
> +            return (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) ? yuva420_rgb32_MMX : yuv420_rgb32_MMX;
> +#else
> +            if (!(CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P)) return yuv420_rgb32_MMX;
> +#endif
>          case PIX_FMT_BGR24:  return yuv420_rgb24_MMX;
>          case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
>          case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
> @@ -468,6 +553,16 @@
>  
>      av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
>  
> +#if CONFIG_SWSCALE_ALPHA
> +    if (c->srcFormat == PIX_FMT_YUVA420P)
> +        switch(c->dstFormat){
> +        case PIX_FMT_ARGB:
> +        case PIX_FMT_ABGR: return yuva2argb_c;
> +        case PIX_FMT_RGBA:
> +        case PIX_FMT_BGRA: return yuva2rgba_c;
> +        }
> +#endif
> +
>      switch (c->dstFormat) {
>      case PIX_FMT_BGR32_1:
>      case PIX_FMT_RGB32_1:
> @@ -533,7 +628,7 @@
>      uint8_t *y_table;
>      uint16_t *y_table16;
>      uint32_t *y_table32;
> -    int i, base, rbase, gbase, bbase, abase;
> +    int i, base, rbase, gbase, bbase, abase, needAlpha;
>      const int yoffs = fullRange ? 384 : 326;
>  
>      int64_t crv =  inv_table[0];
> @@ -659,13 +754,15 @@
>          rbase = base + (isRgb ? 16 : 0);
>          gbase = base + 8;
>          bbase = base + (isRgb ? 0 : 16);
> -        abase = (c->dstFormat == PIX_FMT_RGBA || c->dstFormat == PIX_FMT_BGRA) ? 24 : 0;
> +        needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
> +        if (!needAlpha)
> +            abase = (c->dstFormat == PIX_FMT_RGBA || c->dstFormat == PIX_FMT_BGRA) ? 24 : 0;
>          c->yuvTable = av_malloc(1024*3*4);
>          y_table32 = c->yuvTable;
>          yb = -(384<<16) - oy;
>          for (i = 0; i < 1024; i++) {
>              uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
> -            y_table32[i     ] = (yval << rbase) + (255 << abase);
> +            y_table32[i     ] = (yval << rbase) + (needAlpha ? 0 : (255 << abase));
>              y_table32[i+1024] = yval << gbase;
>              y_table32[i+2048] = yval << bbase;
>              yb += cy;
> Index: ffmpeg/libswscale/yuv2rgb_template.c
> ===================================================================
> --- ffmpeg.orig/libswscale/yuv2rgb_template.c	2009-02-27 23:27:15.227630601 +0100
> +++ ffmpeg/libswscale/yuv2rgb_template.c	2009-02-27 23:27:23.171601482 +0100
> @@ -162,7 +162,8 @@
>          "add $"AV_STRINGIFY(depth*8)", %1    \n\t" \
>          "add                       $4, %0    \n\t" \
>          " js                       1b        \n\t" \
> -\
> +
> +#define YUV2RGB_OPERANDS \
>          : "+r" (index), "+r" (image) \
>          : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) \
>          ); \
> @@ -170,6 +171,14 @@
>      __asm__ volatile (EMMS); \
>      return srcSliceH; \
>  
> +#define YUV2RGB_OPERANDS_ALPHA \
> +        : "+r" (index), "+r" (image) \
> +        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index), "r" (pa - 2*index) \
> +        ); \
> +    } \
> +    __asm__ volatile (EMMS); \
> +    return srcSliceH; \
> +
>  static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
>                                         int srcSliceH, uint8_t* dst[], int dstStride[]){
>      int y, h_size;
> @@ -223,6 +232,7 @@
>          MOVNTQ "   %%mm5, 8 (%1);" /* store pixel 4-7 */
>  
>      YUV2RGB_ENDLOOP(2)
> +    YUV2RGB_OPERANDS
>  }
>  
>  static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
> @@ -280,6 +290,7 @@
>          MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
>  
>      YUV2RGB_ENDLOOP(2)
> +    YUV2RGB_OPERANDS
>  }
>  
>  static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
> @@ -394,6 +405,7 @@
>  #endif
>  
>      YUV2RGB_ENDLOOP(3)
> +    YUV2RGB_OPERANDS
>  }
>  
>  #define RGB_PLANAR2PACKED32                                             \
> @@ -450,4 +462,23 @@
>          RGB_PLANAR2PACKED32
>  
>      YUV2RGB_ENDLOOP(4)
> +    YUV2RGB_OPERANDS
> +}
> +
> +#if CONFIG_SWSCALE_ALPHA
> +static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
> +                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
> +    int y, h_size;
> +
> +    YUV2RGB_LOOP(4)
> +
> +        uint8_t *pa = src[3] + y*srcStride[3];
> +        YUV2RGB_INIT
> +        YUV2RGB
> +        "movq     (%6, %0, 2), %%mm3;"            /* Load 8 A A7 A6 A5 A4 A3 A2 A1 A0 */
> +        RGB_PLANAR2PACKED32
> +
> +    YUV2RGB_ENDLOOP(4)
> +    YUV2RGB_OPERANDS_ALPHA
>  }
> +#endif

patch ok

will review last patch later

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

> ... defining _GNU_SOURCE...
For the love of all that is holy, and some that is not, don't do that.
-- Luca & Mans
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090228/853335f8/attachment.pgp>



More information about the ffmpeg-devel mailing list