[FFmpeg-devel] Extend/optimize RGB to RGB conversions funcsintorgb2rgb.c
yann.lepetitcorps at free.fr
yann.lepetitcorps at free.fr
Tue Sep 11 00:25:32 CEST 2012
And I found this at:
http://stackoverflow.com/questions/7194452/fast-vectorized-conversion-from-rgb-to-bgra
#include <tmmintrin.h>
/*
 * Convert w packed 24-bit pixels (bytes R,G,B) read from `in` into packed
 * 32-bit pixels (bytes B,G,R,0xff) written to `out`.
 *
 * w   - number of pixels to convert; any value is accepted, including 0.
 * in  - source buffer, at least 3*w bytes, must be 16-byte aligned.
 * out - destination buffer, at least 4*w bytes, must be 16-byte aligned.
 *
 * Pixels are converted 16 at a time with SSSE3 byte shuffles; the remaining
 * w % 16 pixels are converted by a plain byte loop so no pixel is skipped.
 *
 * The function needs SSSE3 code generation. On GCC-compatible compilers the
 * target attribute below enables it for this function only; otherwise build
 * the file with SSSE3 enabled (e.g. -mssse3).
 */
#if defined(__GNUC__)
__attribute__((target("ssse3")))
#endif
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    const __m128i *in_vec = in;
    __m128i *out_vec = out;
    unsigned blocks = w / 16;      /* full 16-pixel groups (48 bytes in, 64 out) */
    const unsigned tail = w % 16;  /* leftover pixels handled without SIMD */

    /*
     * Shuffle control for four pixels stored in bytes 0..11 of a vector:
     * each output dword takes its three source bytes in reverse order
     * (R,G,B -> B,G,R); index -1 has its top bit set, which makes pshufb
     * write zero into that lane (the future X/alpha byte).
     */
    const __m128i swap_rb = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8,
                                         -1, 3, 4, 5, -1, 0, 1, 2);
    /* 0xff in the top byte of every 32-bit pixel. */
    const __m128i opaque = _mm_set1_epi32((int)0xff000000u);

    while (blocks-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
         * in_vec[0]  Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]  Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]  Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        const __m128i v0 = _mm_load_si128(&in_vec[0]);
        const __m128i v1 = _mm_load_si128(&in_vec[1]);
        const __m128i v2 = _mm_load_si128(&in_vec[2]);
        __m128i quad;

        /* Pixels a..d already sit in bytes 0..11 of v0. */
        quad = _mm_shuffle_epi8(v0, swap_rb);
        _mm_store_si128(&out_vec[0], _mm_or_si128(quad, opaque));

        /* Pixels e..h: last 4 bytes of v0 followed by first 8 bytes of v1. */
        quad = _mm_shuffle_epi8(_mm_alignr_epi8(v1, v0, 12), swap_rb);
        _mm_store_si128(&out_vec[1], _mm_or_si128(quad, opaque));

        /* Pixels i..l: last 8 bytes of v1 followed by first 4 bytes of v2. */
        quad = _mm_shuffle_epi8(_mm_alignr_epi8(v2, v1, 8), swap_rb);
        _mm_store_si128(&out_vec[2], _mm_or_si128(quad, opaque));

        /* Pixels m..p: bytes 4..15 of v2, moved down to bytes 0..11. */
        quad = _mm_shuffle_epi8(_mm_srli_si128(v2, 4), swap_rb);
        _mm_store_si128(&out_vec[3], _mm_or_si128(quad, opaque));

        in_vec += 3;
        out_vec += 4;
    }

    /* Scalar conversion of the pixels that do not fill a 16-pixel group. */
    {
        const unsigned char *src = (const unsigned char *)in_vec;
        unsigned char *dst = (unsigned char *)out_vec;
        unsigned i;

        for (i = 0; i < tail; i++) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst[3] = 0xff;
            src += 3;
            dst += 4;
        }
    }
}
but I get error messages at compilation :(
gcc test_rgb2rgba.c -o test_rgb2rgba -O9
In file included from test_rgb2rgba.c:240:0:
/usr/lib/gcc/i686-linux-gnu/4.6/include/tmmintrin.h:31:3: erreur: #error "SSSE3
instruction set not enabled"
test_rgb2rgba.c: In function rgb_to_bgrx_sse:
test_rgb2rgba.c:245:5: erreur: unknown type name __m128i
test_rgb2rgba.c:246:5: erreur: unknown type name __m128i
test_rgb2rgba.c:256:9: erreur: unknown type name __m128i
test_rgb2rgba.c:257:9: erreur: unknown type name __m128i
make: *** [test_rgb2rgba] Erreur 1
However, this seems to be a good start :)
@+
Yannoo
Selon yann.lepetitcorps at free.fr:
> I have tested with 2 versions that I have found on the net but the result is
> always the same ... less speed than the original :(
>
> void fast_unpack(const uint8_t* rgb, uint8_t* rgba, const int count) {
>
> int i, j;
>
> if(count==0)
> return;
> for( i=count; --i; rgba+=4, rgb+=3) {
> *(uint32_t*)(void*)rgba = *(const uint32_t*)(const void*)rgb;
> }
> for( j=0; j<3; ++j) {
> rgba[j] = rgb[j];
> }
> }
>
> void RGB8ToBGRX8(int w, const void *in, void *out)
> {
> int i;
> int width = w;
> const unsigned char *src= (const unsigned char*) in;
> unsigned int *dst= (unsigned int*) out;
> unsigned int invalue, outvalue;
>
> for (i=0; i<width; i++, src+=3, dst++)
> {
> invalue = src[0];
> outvalue = (invalue<<16);
> invalue = src[1];
> outvalue |= (invalue<<8);
> invalue = src[2];
> outvalue |= (invalue);
> *dst = outvalue | 0xff000000;
> }
> }
>
>
> The concerned part on my test procedure is here :
>
> printf("Test new rgb24to32() func : ");
> t2 = GetTimestamp();
> for( i = 0 ; i < NB_TESTS ; i++)
> {
> // rgb24to32_alpha((uint8_t *)rgbTab, (uint8_t *)rgbaTab, NB_PIXELS * 3,
> 255);
> // rgb24to32_uint32((uint8_t *)rgbTab, (uint8_t *)rgbaTab, NB_PIXELS * 3);
> // fast_unpack((uint8_t *)rgbTab, (uint8_t *)rgbaTab, NB_PIXELS);
> RGB8ToBGRX8(NB_PIXELS, rgbTab, rgbaTab);
> }
> t3 = GetTimestamp();
> printf( "%d ms \n", t3 - t2);
>
>
> @+
> Yannoo
>
> Selon yann.lepetitcorps at free.fr:
>
> > The pb don't seem very difficult to resolve because this is only the red
> and
> > blue components that are systematically interchanged
> >
> > But the gain is really too small (and sometimes negative) :(
> >
> > => I begin to think now that only MMX/SSE instructions (or vectorized
> > equivalents) can handle this more speedly ...
> >
> > ==> I take a look into my olds MMX source codes for to see if I find this
> :)
> > (on other side, I'm sure that this can be finded in the net ...)
> >
> >
> > @+
> > Yannoo
> >
> > Selon yann.lepetitcorps at free.fr:
> >
> > > Thanks for your contribution
> > >
> > > I have tested it and found that the conversion isn't valid :(
> > >
> > > void rgb24to32_uint32(const uint8_t *src, uint8_t *dst, int src_size )
> > > {
> > > int nPixels = src_size / 3;
> > > int pixels4 = nPixels >> 2;
> > > int extra = nPixels % 4;
> > > uint32_t * pDst = (uint32_t*)dst;
> > > uint32_t * pSrc = (uint32_t*)src;
> > > uint8_t* pBytes;
> > > int i;
> > >
> > > for ( i = 0; i < pixels4; ++i) {
> > > #if HAVE_BIGENDIAN
> > > pDst[0] = 0xFF000000 | (pSrc[0] >> 8);
> > > pDst[1] = 0xFF000000 | (pSrc[0] << 16) | (pSrc[1] >> 16);
> > > pDst[2] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[2] >> 24);
> > > pDst[3] = 0xFF000000 | pSrc[2];
> > > #else
> > > pDst[0] = 0xFF000000 | pSrc[0];
> > > pDst[1] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[0] >> 24);
> > > pDst[2] = 0xFF000000 | (pSrc[2] << 16) | (pSrc[1] >> 16);
> > > pDst[3] = 0xFF000000 | (pSrc[2] >> 8);
> > > #endif
> > > pDst +=4;
> > > pSrc +=3;
> > > }
> > >
> > > pBytes = (uint8_t*)pSrc;
> > >
> > > for ( i = 0; i < extra; i++) {
> > > #if HAVE_BIGENDIAN
> > > *pDst++ = 0xFF000000 | (pBytes[0] << 16) | (pBytes[1] << 8) |
> > > (pBytes[2]);
> > > #else
> > > *pDst++ = 0xFF000000 | (pBytes[2] << 16) | (pBytes[1] << 8) |
> > > (pBytes[0]);
> > > #endif
> > > pBytes += 3;
> > > }
> > > }
> > >
> > >
> > > Because it give this in my procedure test :
> > >
> > > Test original rgb24to32() func : 477 ms
> > > Test new rgb24to32() func : 474 ms
> > > R components of entry 0 aren't the sames (51 vs 223) :(
> > > B components of entry 0 aren't the sames (223 vs 51) :(
> > > R components of entry 1 aren't the sames (46 vs 50) :(
> > > B components of entry 1 aren't the sames (50 vs 46) :(
> > > R components of entry 2 aren't the sames (205 vs 188) :(
> > > B components of entry 2 aren't the sames (188 vs 205) :(
> > > R components of entry 3 aren't the sames (146 vs 87) :(
> > > B components of entry 3 aren't the sames (87 vs 146) :(
> > > R components of entry 4 aren't the sames (109 vs 35) :(
> > > B components of entry 4 aren't the sames (35 vs 109) :(
> > > R components of entry 5 aren't the sames (229 vs 92) :(
> > > B components of entry 5 aren't the sames (92 vs 229) :(
> > >
> > > => we have a very little gain but the conversion is false :(
> > > (my procedure test automatically exit when it found more than 10
> > errors)
> > > [but each loop work with 3x components tests, so this make 3x4 = 12
> > > errors
> > > before to automaticaly exit]
> > >
> > > My procedure test is outside my FFMPEG git repertory, so I put the source
> > > code
> > > of this test procedure as an attachment
> > >
> > >
> > > @+
> > > Yannoo
> > >
> > >
> > > Selon Don Moir <donmoir at comcast.net>:
> > >
> > > >
> > > > ----- Original Message -----
> > > > From: "Don Moir" <donmoir at comcast.net>
> > > > To: "FFmpeg development discussions and patches"
> > <ffmpeg-devel at ffmpeg.org>
> > > > Sent: Monday, September 10, 2012 3:48 PM
> > > > Subject: Re: [FFmpeg-devel] Extend/optimize RGB to RGB conversions
> > > > funcsintorgb2rgb.c
> > > >
> > > >
> > > > >> void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size )
> > > > >> {
> > > > >> int i;
> > > > >> uint8_t *psrc = src;
> > > > >>
> > > > >> for ( i = 0 ; i < src_size ; i += 3, psrc +=3, dst +=4 )
> > > > >> {
> > > > >> #if HAVE_BIGENDIAN
> > > > >> /* RGB24 (= R,G,B) -> BGR32 (= 255,R,G,B) */
> > > > >> dst[0] = 255;
> > > > >> dst[1] = psrc[0];
> > > > >> dst[2] = psrc[1];
> > > > >> dst[3] = psrc[2];
> > > > >> #else
> > > > >> dst[0] = psrc[2];
> > > > >> dst[1] = psrc[1];
> > > > >> dst[2] = psrc[0];
> > > > >> dst[3] = 255;
> > > > >> #endif
> > > > >> }
> > > > >> }
> > > > >
> > > > > You might try something like this that does 4 pixels within the loop.
> > It
> > > > > might be interesting to see if performance is better for this. I do
> it
> > > asm
> > > > > and don't do it line by line for my own purposes.
> > > > >
> > > > > Note: somewhat pseudo code. I do it differently so modified here.
> > > > >
> > > > > void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size )
> > > > > {
> > > > > int nPixels = src_size / 3;
> > > > > int pixels4 = nPixels >> 2;
> > > > > int extra = nPixels % 4;
> > > > > uint32_t* pDst (uint32_t*)dst;
> > > > > uint32_t* pSrc (uint32_t*)src;
> > > > >
> > > > > for (int i = 0; i < pixels4; ++i)
> > > > > {
> > > > > #if HAVE_BIGENDIAN
> > > > > pDst[0] = 0xFF000000 | (pSrc[0] >> 8);
> > > > > pDst[1] = 0xFF000000 | (pSrc[0] << 16) | (pSrc[1] >> 16);
> > > > > pDst[2] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[2] >> 24);
> > > > > pDst[3] = 0xFF000000 | pSrc[2];
> > > > > #else
> > > > > pDst[0] = 0xFF000000 | pSrc[0];
> > > > > pDst[1] = 0xFF000000 | (pSrc[1] << 8) | (pSrc[0] >> 24);
> > > > > pDst[2] = 0xFF000000 | ((pSrc[2] << 16) | (pSrc[1] >> 16);
> > > > > pDst[3] = 0xFF000000 | (pSrc[2] >> 8);
> > > > > #endif
> > > > > pDst +=4;
> > > > > pSrc +=3;
> > > > > }
> > > > >
> > > >
> > > > Sorry mistake:
> > > >
> > > > - uint8_t* pBytes = (uint8_t*)pDst;
> > > > + uint8_t* pBytes = (uint8_t*)pSrc;
> > > >
> > > > > for (int i = 0; i < extra; i++)
> > > > > {
> > > > > #if HAVE_BIGENDIAN
> > > > > *pDst++ = 0xFF000000 | (pBytes[0] << 16) | (pBytes[1] << 8) |
> > > > > (pBytes[2]);
> > > > > #else
> > > > > *pDst++ = 0xFF000000 | (pBytes[2] << 16) | (pBytes[1] << 8) |
> > > > > (pBytes[0]);
> > > > > #endif
> > > > > pBytes += 3;
> > > > > }
> > > > > }
> > > > > _______________________________________________
> > > > > ffmpeg-devel mailing list
> > > > > ffmpeg-devel at ffmpeg.org
> > > > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > > >
> > > > _______________________________________________
> > > > ffmpeg-devel mailing list
> > > > ffmpeg-devel at ffmpeg.org
> > > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > > >
> > >
> > >
> > >
> >
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel at ffmpeg.org
> > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
More information about the ffmpeg-devel
mailing list