[FFmpeg-devel] Extend/optimize RGB to RGB conversions funcs into rgb2rgb.c

Tue Sep 11 00:25:32 CEST 2012

And I found this at
http://stackoverflow.com/questions/7194452/fast-vectorized-conversion-from-rgb-to-bgra

#include <tmmintrin.h>

/*
 * Convert packed 24-bit RGB pixels to 32-bit BGRX using SSSE3 byte shuffles.
 *
 * Every source pixel is three bytes in memory order R, G, B; every
 * destination pixel is four bytes in memory order B, G, R, 0xff.
 *
 *   w    number of pixels to convert
 *   in   source buffer, 3 * w bytes; must be 16-byte aligned
 *   out  destination buffer, 4 * w bytes; must be 16-byte aligned
 *
 * Pixels are converted 16 at a time (48 bytes in, 64 bytes out).  When w is
 * not a multiple of 16 the remaining 1..15 pixels are converted by a scalar
 * loop, so the whole row is always written.
 *
 * The target attribute lets GCC (>= 4.9) and Clang build this function even
 * when the translation unit is compiled without -mssse3; the caller is still
 * responsible for only calling it on a CPU that supports SSSE3.
 */
#if defined(__GNUC__)
__attribute__((target("ssse3")))
#endif
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    /* A pshufb selector byte with its high bit set yields a zero byte. */
    const char Z = (char)0x80;
    /* Explicit cast: 0xff does not fit a signed char without conversion. */
    const char F = (char)0xff;

    /*
     * Selector that turns the low 12 bytes (four RGB pixels) of a vector
     * into four B,G,R,0 groups.  _mm_set_epi8 lists byte 15 first and
     * byte 0 last, so read it right to left.
     */
    const __m128i swap_rb = _mm_set_epi8(Z,  9, 10, 11,
                                         Z,  6,  7,  8,
                                         Z,  3,  4,  5,
                                         Z,  0,  1,  2);
    /* 0xff in every fourth byte: fills the X channel left zero above. */
    const __m128i alpha   = _mm_set_epi8(F, 0, 0, 0,
                                         F, 0, 0, 0,
                                         F, 0, 0, 0,
                                         F, 0, 0, 0);

    const __m128i *in_vec = in;
    __m128i *out_vec = out;
    unsigned blocks = w / 16;
    unsigned tail = w % 16;
    const unsigned char *src;
    unsigned char *dst;

    while (blocks-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
         * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        const __m128i in1 = in_vec[0];
        const __m128i in2 = in_vec[1];
        const __m128i in3 = in_vec[2];
        __m128i px;

        /* Pixels a..d: source bytes 0..11, already at the bottom of in1. */
        px = _mm_shuffle_epi8(in1, swap_rb);
        out_vec[0] = _mm_or_si128(px, alpha);

        /* Pixels e..h: source bytes 12..23 straddle in1 and in2. */
        px = _mm_shuffle_epi8(_mm_alignr_epi8(in2, in1, 12), swap_rb);
        out_vec[1] = _mm_or_si128(px, alpha);

        /* Pixels i..l: source bytes 24..35 straddle in2 and in3. */
        px = _mm_shuffle_epi8(_mm_alignr_epi8(in3, in2, 8), swap_rb);
        out_vec[2] = _mm_or_si128(px, alpha);

        /* Pixels m..p: source bytes 36..47, the top 12 bytes of in3. */
        px = _mm_shuffle_epi8(_mm_srli_si128(in3, 4), swap_rb);
        out_vec[3] = _mm_or_si128(px, alpha);

        in_vec += 3;
        out_vec += 4;
    }

    /* Scalar conversion of the pixels that do not fill a 16-pixel block. */
    src = (const unsigned char *)in_vec;
    dst = (unsigned char *)out_vec;
    while (tail-- > 0) {
        dst[0] = src[2];
        dst[1] = src[1];
        dst[2] = src[0];
        dst[3] = 0xff;
        src += 3;
        dst += 4;
    }
}

but I get error messages at compilation :(

gcc test_rgb2rgba.c -o test_rgb2rgba -O9
In file included from test_rgb2rgba.c:240:0:
/usr/lib/gcc/i686-linux-gnu/4.6/include/tmmintrin.h:31:3: erreur: #error "SSSE3
instruction set not enabled"
test_rgb2rgba.c: In function ‘rgb_to_bgrx_sse’:
test_rgb2rgba.c:245:5: erreur: unknown type name ‘__m128i’
test_rgb2rgba.c:246:5: erreur: unknown type name ‘__m128i’
test_rgb2rgba.c:256:9: erreur: unknown type name ‘__m128i’
test_rgb2rgba.c:257:9: erreur: unknown type name ‘__m128i’
make: *** [test_rgb2rgba] Erreur 1

But this nevertheless seems to be a good start :)

@+
Yannoo

Selon yann.lepetitcorps at free.fr:

> I have tested with 2 versions that I have found on the net but the result is
> always the same ... less speed than the original :(
>
> void fast_unpack(const uint8_t* rgb, uint8_t* rgba, const int count) {
>
>     int i, j;
>
>     if(count==0)
>         return;
>     for( i=count; --i; rgba+=4, rgb+=3) {
>         *(uint32_t*)(void*)rgba = *(const uint32_t*)(const void*)rgb;
>     }
>     for( j=0; j<3; ++j) {
>         rgba[j] = rgb[j];
>     }
> }
>
> void RGB8ToBGRX8(int w, const void *in, void *out)
>     {
>         int i;
>         int width = w;
>         const unsigned char *src= (const unsigned char*) in;
>         unsigned int *dst= (unsigned int*) out;
>         unsigned int invalue, outvalue;
>
>         for (i=0; i<width; i++, src+=3, dst++)
>         {
>                 invalue = src[0];
>                 outvalue = (invalue<<16);
>                 invalue = src[1];
>                 outvalue |= (invalue<<8);
>                 invalue = src[2];
>                 outvalue |= (invalue);
>                 *dst = outvalue | 0xff000000;
>         }
>       }
>
>
> The concerned part on my test procedure is here :
>
> 	printf("Test new rgb24to32() func : ");
> 	t2 = GetTimestamp();
> 	for( i = 0 ; i < NB_TESTS ; i++)
> 	{
> 		// rgb24to32_alpha((uint8_t *)rgbTab, (uint8_t *)rgbaTab, NB_PIXELS * 3,
> 255);
> 		// rgb24to32_uint32((uint8_t *)rgbTab, (uint8_t *)rgbaTab, NB_PIXELS * 3);
> 		// fast_unpack((uint8_t *)rgbTab, (uint8_t *)rgbaTab, NB_PIXELS);
> 		RGB8ToBGRX8(NB_PIXELS, rgbTab, rgbaTab);
> 	}
> 	t3 = GetTimestamp();
> 	printf( "%d ms \n", t3 - t2);
>
>
> @+
> Yannoo
>
> Selon yann.lepetitcorps at free.fr:
>
> > The pb don't seem very difficult to resolve because this is only the red
> and
> > blue components that are systematically interchanged
> >
> > But the gain is really too small (and sometimes negative) :(
> >
> > => I begin to think now that only MMX/SSE instructions (or vectorized
> > equivalents)  can handle this more speedly ...
> >
> > ==> I take a look into my olds MMX source codes for to see if I find this
> :)
> >     (on other side, I'm sure that this can be finded in the net ...)
> >
> >
> > @+
> > Yannoo
> >
> > Selon yann.lepetitcorps at free.fr:
> >
> > > Thanks for your contribution
> > >
> > > I have tested it and found that the conversion isn't valid :(
> > >
> > > void rgb24to32_uint32(const uint8_t *src, uint8_t *dst, int src_size )
> > > {
> > >    int nPixels = src_size / 3;
> > >    int pixels4 = nPixels >> 2;
> > >    int extra = nPixels % 4;
> > >    uint32_t * pDst =  (uint32_t*)dst;
> > >    uint32_t * pSrc = (uint32_t*)src;
> > >    uint8_t* pBytes;
> > >    int i;
> > >
> > >    for ( i = 0; i < pixels4; ++i)    {
> > > #if HAVE_BIGENDIAN
> > >        pDst[0] = 0xFF000000 | (pSrc[0] >> 8);
> > >        pDst[1] = 0xFF000000 | (pSrc[0] << 16) | (pSrc[1] >> 16);
> > >        pDst[2] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[2] >> 24);
> > >        pDst[3] = 0xFF000000 | pSrc[2];
> > > #else
> > >        pDst[0] = 0xFF000000 | pSrc[0];
> > >        pDst[1] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[0] >> 24);
> > >        pDst[2] = 0xFF000000 | (pSrc[2] << 16) | (pSrc[1] >> 16);
> > >        pDst[3] = 0xFF000000 | (pSrc[2] >> 8);
> > > #endif
> > >        pDst +=4;
> > >        pSrc +=3;
> > >     }
> > >
> > >     pBytes = (uint8_t*)pSrc;
> > >
> > >   for ( i = 0; i < extra; i++)    {
> > > #if HAVE_BIGENDIAN
> > >        *pDst++ = 0xFF000000 | (pBytes[0] << 16) | (pBytes[1] << 8) |
> > > (pBytes[2]);
> > > #else
> > >        *pDst++ = 0xFF000000 | (pBytes[2] << 16) | (pBytes[1] << 8) |
> > > (pBytes[0]);
> > > #endif
> > >        pBytes += 3;
> > >    }
> > > }
> > >
> > >
> > > Because it give this in my procedure test :
> > >
> > > Test original rgb24to32() func : 477 ms
> > > Test new rgb24to32() func : 474 ms
> > > R components of entry 0 aren't the sames (51 vs 223) :(
> > > B components of entry 0 aren't the sames (223 vs 51) :(
> > > R components of entry 1 aren't the sames (46 vs 50) :(
> > > B components of entry 1 aren't the sames (50 vs 46) :(
> > > R components of entry 2 aren't the sames (205 vs 188) :(
> > > B components of entry 2 aren't the sames (188 vs 205) :(
> > > R components of entry 3 aren't the sames (146 vs 87) :(
> > > B components of entry 3 aren't the sames (87 vs 146) :(
> > > R components of entry 4 aren't the sames (109 vs 35) :(
> > > B components of entry 4 aren't the sames (35 vs 109) :(
> > > R components of entry 5 aren't the sames (229 vs 92) :(
> > > B components of entry 5 aren't the sames (92 vs 229) :(
> > >
> > > => we have a very little gain but the conversion is false :(
> > >     (my procedure test automatically exit when it found more than 10
> > errors)
> > >     [but each loop work with 3x components tests, so this make 3x4 = 12
> > > errors
> > > before to automaticaly  exit]
> > >
> > > My procedure test is outside my FFMPEG git repertory, so I put the source
> > > code
> > > of this test procedure as an attachment
> > >
> > >
> > > @+
> > > Yannoo
> > >
> > >
> > > Selon Don Moir <donmoir at comcast.net>:
> > >
> > > >
> > > > ----- Original Message -----
> > > > From: "Don Moir" <donmoir at comcast.net>
> > > > To: "FFmpeg development discussions and patches"
> > <ffmpeg-devel at ffmpeg.org>
> > > > Sent: Monday, September 10, 2012 3:48 PM
> > > > Subject: Re: [FFmpeg-devel] Extend/optimize RGB to RGB conversions
> > > > funcsintorgb2rgb.c
> > > >
> > > >
> > > > >> void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size )
> > > > >> {
> > > > >>    int i;
> > > > >>    uint8_t *psrc = src;
> > > > >>
> > > > >>    for ( i = 0 ; i < src_size ; i += 3, psrc +=3, dst +=4 )
> > > > >>    {
> > > > >> #if HAVE_BIGENDIAN
> > > > >>        /* RGB24 (= R,G,B) -> BGR32 (= 255,R,G,B) */
> > > > >>        dst[0] = 255;
> > > > >>        dst[1] = psrc[0];
> > > > >>        dst[2] = psrc[1];
> > > > >>        dst[3] = psrc[2];
> > > > >> #else
> > > > >>        dst[0] = psrc[2];
> > > > >>        dst[1] = psrc[1];
> > > > >>        dst[2] = psrc[0];
> > > > >>        dst[3] = 255;
> > > > >> #endif
> > > > >>    }
> > > > >> }
> > > > >
> > > > > You might try something like this that does 4 pixels within the loop.
> > It
> > > > > might be interesting to see if performance is better for this. I do
> it
> > > asm
> > > > > and don't do it line by line for my own purposes.
> > > > >
> > > > > Note: somewhat pseudo code. I do it differently so modified here.
> > > > >
> > > > > void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size )
> > > > > {
> > > > >    int nPixels = src_size / 3;
> > > > >    int pixels4 = nPixels >> 2;
> > > > >    int extra = nPixels % 4;
> > > > >    uint32_t* pDst (uint32_t*)dst;
> > > > >    uint32_t* pSrc (uint32_t*)src;
> > > > >
> > > > >    for (int i = 0; i < pixels4; ++i)
> > > > >    {
> > > > > #if HAVE_BIGENDIAN
> > > > >        pDst[0] = 0xFF000000 | (pSrc[0] >> 8);
> > > > >        pDst[1] = 0xFF000000 | (pSrc[0] << 16) | (pSrc[1] >> 16);
> > > > >        pDst[2] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[2] >> 24);
> > > > >        pDst[3] = 0xFF000000 | pSrc[2];
> > > > > #else
> > > > >        pDst[0] = 0xFF000000 | pSrc[0];
> > > > >        pDst[1] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[0] >> 24);
> > > > >        pDst[2] = 0xFF000000 | ((pSrc[2] << 16) | (pSrc[1] >> 16);
> > > > >        pDst[3] = 0xFF000000 | (pSrc[2] >> 8);
> > > > > #endif
> > > > >        pDst +=4;
> > > > >        pSrc +=3;
> > > > >    }
> > > > >
> > > >
> > > > Sorry mistake:
> > > >
> > > > -    uint8_t* pBytes = (uint8_t*)pDst;
> > > > +   uint8_t* pBytes = (uint8_t*)pSrc;
> > > >
> > > > >    for (int i = 0; i < extra; i++)
> > > > >    {
> > > > > #if HAVE_BIGENDIAN
> > > > >        *pDst++ = 0xFF000000 | (pBytes[0] << 16) | (pBytes[1] << 8) |
> > > > > (pBytes[2]);
> > > > > #else
> > > > >        *pDst++ = 0xFF000000 | (pBytes[2] << 16) | (pBytes[1] << 8) |
> > > > > (pBytes[0]);
> > > > > #endif
> > > > >        pBytes += 3;
> > > > >    }
> > > > > }
> > > > > _______________________________________________
> > > > > ffmpeg-devel mailing list
> > > > > ffmpeg-devel at ffmpeg.org
> > > > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > > >
> > > > _______________________________________________
> > > > ffmpeg-devel mailing list
> > > > ffmpeg-devel at ffmpeg.org
> > > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > > >
> > >
> > >
> > >
> >
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel at ffmpeg.org
> > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>