[MPlayer-dev-eng] altivec patch 5/5: don't output the wrong format
Alan Curry
pacman at TheWorld.com
Sat Feb 11 23:31:32 CET 2006
I wrote:
>
>OK, I can split altivec_yuv2packedX into separate functions, with a macro to
>declare them all with nearly-identical bodies, following the precedent set by
>DEFCSP420_CVT. The callers will then select the output format by calling the
>specific relevant function, which also gives us a chance to handle the
>formats that aren't supported in the altivec code by calling yuv2packedXinC
>instead.
Well, I tried 2 different variants of this idea, turning the body of
altivec_yuv2packedX into a macro: one where there were actually 6 different
functions and one where the body was expanded 6 times in one function
controlled by a single switch(). That latter case was the macro that broke
the camel's back. gcc ate a gig and a half of memory and died. I could add
more swap, but seriously...
The first one did compile (although it took a long time since it had to
compile that heavy function 6 times), but it didn't give any measurable
improvement over the version with the switch() in the inner loop.
yuv2rgb_altivec.c in its current form is pretty hard on the compiler, using
almost a half gig of memory. That's mostly because of the hideous macros in
<altivec.h> (who really thought it was a good idea to sneak overloaded
functions into C?).
I don't seriously suggest these patches, but in case you want to see for
yourself, here they are.
-------------- next part --------------
--- yuv2rgb_altivec.c 2006-02-11 16:54:07.000000000 -0500
+++ yuv2rgb_altivec.c.try2 2006-02-11 16:29:50.000000000 -0500
@@ -767,6 +767,125 @@
return;
}
+#define YSCALE_YUV_2_RGBX_ALTIVEC(out_fmt) \
+ for(i=0; i<dstW; i+=16){\
+ Y0 = RND;\
+ Y1 = RND;\
+ /* extract 16 coeffs from lumSrc */\
+ for(j=0; j<lumFilterSize; j++) {\
+ X0 = vec_ld (0, &lumSrc[j][i]);\
+ X1 = vec_ld (16, &lumSrc[j][i]);\
+ Y0 = vec_mradds (X0, YCoeffs[j], Y0);\
+ Y1 = vec_mradds (X1, YCoeffs[j], Y1);\
+ }\
+ \
+ U = RND;\
+ V = RND;\
+ /* extract 8 coeffs from U,V */\
+ for(j=0; j<chrFilterSize; j++) {\
+ X = vec_ld (0, &chrSrc[j][i/2]);\
+ U = vec_mradds (X, CCoeffs[j], U);\
+ X = vec_ld (0, &chrSrc[j][i/2+2048]);\
+ V = vec_mradds (X, CCoeffs[j], V);\
+ }\
+ \
+ /* scale and clip signals */\
+ Y0 = vec_sra (Y0, SCL);\
+ Y1 = vec_sra (Y1, SCL);\
+ U = vec_sra (U, SCL);\
+ V = vec_sra (V, SCL);\
+ \
+ Y0 = vec_clip (Y0);\
+ Y1 = vec_clip (Y1);\
+ U = vec_clip (U);\
+ V = vec_clip (V);\
+ \
+ /* now we have\
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+ U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7\
+ \
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+ U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7\
+ V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7\
+ */\
+ \
+ U0 = vec_mergeh (U,U);\
+ V0 = vec_mergeh (V,V);\
+ \
+ U1 = vec_mergel (U,U);\
+ V1 = vec_mergel (V,V);\
+ \
+ cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);\
+ cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);\
+ \
+ R = vec_packclp (R0,R1);\
+ G = vec_packclp (G0,G1);\
+ B = vec_packclp (B0,B1);\
+ \
+ out_fmt (R,G,B,out);\
+ }\
+ \
+ if (i < dstW) {\
+ i -= 16;\
+ \
+ Y0 = RND;\
+ Y1 = RND;\
+ /* extract 16 coeffs from lumSrc */\
+ for(j=0; j<lumFilterSize; j++) {\
+ X0 = vec_ld (0, &lumSrc[j][i]);\
+ X1 = vec_ld (16, &lumSrc[j][i]);\
+ Y0 = vec_mradds (X0, YCoeffs[j], Y0);\
+ Y1 = vec_mradds (X1, YCoeffs[j], Y1);\
+ }\
+ \
+ U = RND;\
+ V = RND;\
+ /* extract 8 coeffs from U,V */\
+ for(j=0; j<chrFilterSize; j++) {\
+ X = vec_ld (0, &chrSrc[j][i/2]);\
+ U = vec_mradds (X, CCoeffs[j], U);\
+ X = vec_ld (0, &chrSrc[j][i/2+2048]);\
+ V = vec_mradds (X, CCoeffs[j], V);\
+ }\
+ \
+ /* scale and clip signals */\
+ Y0 = vec_sra (Y0, SCL);\
+ Y1 = vec_sra (Y1, SCL);\
+ U = vec_sra (U, SCL);\
+ V = vec_sra (V, SCL);\
+ \
+ Y0 = vec_clip (Y0);\
+ Y1 = vec_clip (Y1);\
+ U = vec_clip (U);\
+ V = vec_clip (V);\
+ \
+ /* now we have\
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+ U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7\
+ \
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+ U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7\
+ V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7\
+ */\
+ \
+ U0 = vec_mergeh (U,U);\
+ V0 = vec_mergeh (V,V);\
+ \
+ U1 = vec_mergel (U,U);\
+ V1 = vec_mergel (V,V);\
+ \
+ cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);\
+ cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);\
+ \
+ R = vec_packclp (R0,R1);\
+ G = vec_packclp (G0,G1);\
+ B = vec_packclp (B0,B1);\
+ \
+ nout = (vector unsigned char *)scratch;\
+ out_fmt (R,G,B,nout);\
+ \
+ memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);\
+ }
void
altivec_yuv2packedX (SwsContext *c,
@@ -793,153 +912,23 @@
out = (vector unsigned char *)dest;
- for(i=0; i<dstW; i+=16){
- Y0 = RND;
- Y1 = RND;
- /* extract 16 coeffs from lumSrc */
- for(j=0; j<lumFilterSize; j++) {
- X0 = vec_ld (0, &lumSrc[j][i]);
- X1 = vec_ld (16, &lumSrc[j][i]);
- Y0 = vec_mradds (X0, YCoeffs[j], Y0);
- Y1 = vec_mradds (X1, YCoeffs[j], Y1);
- }
-
- U = RND;
- V = RND;
- /* extract 8 coeffs from U,V */
- for(j=0; j<chrFilterSize; j++) {
- X = vec_ld (0, &chrSrc[j][i/2]);
- U = vec_mradds (X, CCoeffs[j], U);
- X = vec_ld (0, &chrSrc[j][i/2+2048]);
- V = vec_mradds (X, CCoeffs[j], V);
- }
-
- /* scale and clip signals */
- Y0 = vec_sra (Y0, SCL);
- Y1 = vec_sra (Y1, SCL);
- U = vec_sra (U, SCL);
- V = vec_sra (V, SCL);
-
- Y0 = vec_clip (Y0);
- Y1 = vec_clip (Y1);
- U = vec_clip (U);
- V = vec_clip (V);
-
- /* now we have
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
-
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
- V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
- */
-
- U0 = vec_mergeh (U,U);
- V0 = vec_mergeh (V,V);
-
- U1 = vec_mergel (U,U);
- V1 = vec_mergel (V,V);
-
- cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
- cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
- R = vec_packclp (R0,R1);
- G = vec_packclp (G0,G1);
- B = vec_packclp (B0,B1);
-
- switch(c->dstFormat) {
- case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
- case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
- case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
- case IMGFMT_ARGB: out_argb (R,G,B,out); break;
- case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
- case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
- default:
- {
- /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
- static int printed_error_message;
- if(!printed_error_message) {
- MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
- vo_format_name(c->dstFormat));
- printed_error_message=1;
- }
- return;
- }
- }
- }
-
- if (i < dstW) {
- i -= 16;
-
- Y0 = RND;
- Y1 = RND;
- /* extract 16 coeffs from lumSrc */
- for(j=0; j<lumFilterSize; j++) {
- X0 = vec_ld (0, &lumSrc[j][i]);
- X1 = vec_ld (16, &lumSrc[j][i]);
- Y0 = vec_mradds (X0, YCoeffs[j], Y0);
- Y1 = vec_mradds (X1, YCoeffs[j], Y1);
- }
-
- U = RND;
- V = RND;
- /* extract 8 coeffs from U,V */
- for(j=0; j<chrFilterSize; j++) {
- X = vec_ld (0, &chrSrc[j][i/2]);
- U = vec_mradds (X, CCoeffs[j], U);
- X = vec_ld (0, &chrSrc[j][i/2+2048]);
- V = vec_mradds (X, CCoeffs[j], V);
- }
-
- /* scale and clip signals */
- Y0 = vec_sra (Y0, SCL);
- Y1 = vec_sra (Y1, SCL);
- U = vec_sra (U, SCL);
- V = vec_sra (V, SCL);
-
- Y0 = vec_clip (Y0);
- Y1 = vec_clip (Y1);
- U = vec_clip (U);
- V = vec_clip (V);
-
- /* now we have
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
-
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
- V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
- */
-
- U0 = vec_mergeh (U,U);
- V0 = vec_mergeh (V,V);
-
- U1 = vec_mergel (U,U);
- V1 = vec_mergel (V,V);
-
- cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
- cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
- R = vec_packclp (R0,R1);
- G = vec_packclp (G0,G1);
- B = vec_packclp (B0,B1);
-
- nout = (vector unsigned char *)scratch;
- switch(c->dstFormat) {
- case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
- case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
- case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
- case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
- case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
- case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
- default:
- /* Unreachable, I think. */
- MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
- vo_format_name(c->dstFormat));
- return;
- }
-
- memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
+ switch(c->dstFormat) {
+ case IMGFMT_ABGR: YSCALE_YUV_2_RGBX_ALTIVEC(out_abgr); break;
+ case IMGFMT_BGRA: YSCALE_YUV_2_RGBX_ALTIVEC(out_bgra); break;
+ case IMGFMT_RGBA: YSCALE_YUV_2_RGBX_ALTIVEC(out_rgba); break;
+ case IMGFMT_ARGB: YSCALE_YUV_2_RGBX_ALTIVEC(out_argb); break;
+ case IMGFMT_RGB24: YSCALE_YUV_2_RGBX_ALTIVEC(out_rgb24); break;
+ case IMGFMT_BGR24: YSCALE_YUV_2_RGBX_ALTIVEC(out_bgr24); break;
+ default:
+ {
+ /* FIXME: possibly call yuv2packedXinC here */
+ static int printed_error_message;
+ if(!printed_error_message) {
+ MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
+ vo_format_name(c->dstFormat));
+ printed_error_message=1;
+ }
+ return;
+ }
}
-
}
-------------- next part --------------
--- yuv2rgb_altivec.c 2006-02-11 16:54:07.000000000 -0500
+++ yuv2rgb_altivec.c.try1 2006-02-11 16:53:57.000000000 -0500
@@ -767,6 +767,158 @@
return;
}
+#define DEFCSP420_CVT_SCALED(name,out_pixels) \
+static int altivec_scaled_##name (SwsContext *c, \
+ int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, \
+ int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, \
+ uint8_t *dest, int dstW, int dstY) \
+{ \
+ int i,j; \
+ short *f; \
+ vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; \
+ vector signed short R0,G0,B0,R1,G1,B1; \
+ \
+ vector unsigned char R,G,B,pels[3]; \
+ vector unsigned char *out,*nout; \
+ \
+ vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0); \
+ vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0); \
+ unsigned long scratch[16] __attribute__ ((aligned (16))); \
+ \
+ vector signed short *YCoeffs, *CCoeffs; \
+ \
+ YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; \
+ CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; \
+ \
+ out = (vector unsigned char *)dest; \
+ \
+ for(i=0; i<dstW; i+=16){ \
+ Y0 = RND; \
+ Y1 = RND; \
+ /* extract 16 coeffs from lumSrc */ \
+ for(j=0; j<lumFilterSize; j++) { \
+ X0 = vec_ld (0, &lumSrc[j][i]); \
+ X1 = vec_ld (16, &lumSrc[j][i]); \
+ Y0 = vec_mradds (X0, YCoeffs[j], Y0); \
+ Y1 = vec_mradds (X1, YCoeffs[j], Y1); \
+ } \
+ \
+ U = RND; \
+ V = RND; \
+ /* extract 8 coeffs from U,V */ \
+ for(j=0; j<chrFilterSize; j++) { \
+ X = vec_ld (0, &chrSrc[j][i/2]); \
+ U = vec_mradds (X, CCoeffs[j], U); \
+ X = vec_ld (0, &chrSrc[j][i/2+2048]); \
+ V = vec_mradds (X, CCoeffs[j], V); \
+ } \
+ \
+ /* scale and clip signals */ \
+ Y0 = vec_sra (Y0, SCL); \
+ Y1 = vec_sra (Y1, SCL); \
+ U = vec_sra (U, SCL); \
+ V = vec_sra (V, SCL); \
+ \
+ Y0 = vec_clip (Y0); \
+ Y1 = vec_clip (Y1); \
+ U = vec_clip (U); \
+ V = vec_clip (V); \
+ \
+ /* now we have \
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 \
+ U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 \
+ \
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 \
+ U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 \
+ V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 \
+ */ \
+ \
+ U0 = vec_mergeh (U,U); \
+ V0 = vec_mergeh (V,V); \
+ \
+ U1 = vec_mergel (U,U); \
+ V1 = vec_mergel (V,V); \
+ \
+ cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); \
+ cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); \
+ \
+ R = vec_packclp (R0,R1); \
+ G = vec_packclp (G0,G1); \
+ B = vec_packclp (B0,B1); \
+ \
+ out_pixels(R,G,B,out); \
+ } \
+ \
+ if (i < dstW) { \
+ i -= 16; \
+ \
+ Y0 = RND; \
+ Y1 = RND; \
+ /* extract 16 coeffs from lumSrc */ \
+ for(j=0; j<lumFilterSize; j++) { \
+ X0 = vec_ld (0, &lumSrc[j][i]); \
+ X1 = vec_ld (16, &lumSrc[j][i]); \
+ Y0 = vec_mradds (X0, YCoeffs[j], Y0); \
+ Y1 = vec_mradds (X1, YCoeffs[j], Y1); \
+ } \
+ \
+ U = RND; \
+ V = RND; \
+ /* extract 8 coeffs from U,V */ \
+ for(j=0; j<chrFilterSize; j++) { \
+ X = vec_ld (0, &chrSrc[j][i/2]); \
+ U = vec_mradds (X, CCoeffs[j], U); \
+ X = vec_ld (0, &chrSrc[j][i/2+2048]); \
+ V = vec_mradds (X, CCoeffs[j], V); \
+ } \
+ \
+ /* scale and clip signals */ \
+ Y0 = vec_sra (Y0, SCL); \
+ Y1 = vec_sra (Y1, SCL); \
+ U = vec_sra (U, SCL); \
+ V = vec_sra (V, SCL); \
+ \
+ Y0 = vec_clip (Y0); \
+ Y1 = vec_clip (Y1); \
+ U = vec_clip (U); \
+ V = vec_clip (V); \
+ \
+ /* now we have \
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 \
+ U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 \
+ \
+ Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 \
+ U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 \
+ V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 \
+ */ \
+ \
+ U0 = vec_mergeh (U,U); \
+ V0 = vec_mergeh (V,V); \
+ \
+ U1 = vec_mergel (U,U); \
+ V1 = vec_mergel (V,V); \
+ \
+ cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); \
+ cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); \
+ \
+ R = vec_packclp (R0,R1); \
+ G = vec_packclp (G0,G1); \
+ B = vec_packclp (B0,B1); \
+ \
+ nout = (vector unsigned char *)scratch; \
+ out_pixels(R,G,B,nout); \
+ \
+ memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); \
+ } \
+ \
+}
+
+DEFCSP420_CVT_SCALED (yuv2_abgr32, out_abgr)
+DEFCSP420_CVT_SCALED (yuv2_bgra32, out_bgra)
+DEFCSP420_CVT_SCALED (yuv2_rgba32, out_rgba)
+DEFCSP420_CVT_SCALED (yuv2_argb32, out_argb)
+DEFCSP420_CVT_SCALED (yuv2_rgb24, out_rgb24)
+DEFCSP420_CVT_SCALED (yuv2_bgr24, out_bgr24)
void
altivec_yuv2packedX (SwsContext *c,
@@ -774,172 +926,47 @@
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int dstY)
{
- int i,j;
- short *f;
- vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
- vector signed short R0,G0,B0,R1,G1,B1;
-
- vector unsigned char R,G,B,pels[3];
- vector unsigned char *out,*nout;
-
- vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0);
- vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
- unsigned long scratch[16] __attribute__ ((aligned (16)));
-
- vector signed short *YCoeffs, *CCoeffs;
-
- YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
- CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
-
- out = (vector unsigned char *)dest;
-
- for(i=0; i<dstW; i+=16){
- Y0 = RND;
- Y1 = RND;
- /* extract 16 coeffs from lumSrc */
- for(j=0; j<lumFilterSize; j++) {
- X0 = vec_ld (0, &lumSrc[j][i]);
- X1 = vec_ld (16, &lumSrc[j][i]);
- Y0 = vec_mradds (X0, YCoeffs[j], Y0);
- Y1 = vec_mradds (X1, YCoeffs[j], Y1);
- }
-
- U = RND;
- V = RND;
- /* extract 8 coeffs from U,V */
- for(j=0; j<chrFilterSize; j++) {
- X = vec_ld (0, &chrSrc[j][i/2]);
- U = vec_mradds (X, CCoeffs[j], U);
- X = vec_ld (0, &chrSrc[j][i/2+2048]);
- V = vec_mradds (X, CCoeffs[j], V);
- }
-
- /* scale and clip signals */
- Y0 = vec_sra (Y0, SCL);
- Y1 = vec_sra (Y1, SCL);
- U = vec_sra (U, SCL);
- V = vec_sra (V, SCL);
-
- Y0 = vec_clip (Y0);
- Y1 = vec_clip (Y1);
- U = vec_clip (U);
- V = vec_clip (V);
-
- /* now we have
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
-
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
- V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
- */
-
- U0 = vec_mergeh (U,U);
- V0 = vec_mergeh (V,V);
-
- U1 = vec_mergel (U,U);
- V1 = vec_mergel (V,V);
-
- cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
- cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
- R = vec_packclp (R0,R1);
- G = vec_packclp (G0,G1);
- B = vec_packclp (B0,B1);
-
- switch(c->dstFormat) {
- case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
- case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
- case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
- case IMGFMT_ARGB: out_argb (R,G,B,out); break;
- case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
- case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
- default:
- {
- /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
- static int printed_error_message;
- if(!printed_error_message) {
- MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
- vo_format_name(c->dstFormat));
- printed_error_message=1;
- }
- return;
- }
- }
- }
-
- if (i < dstW) {
- i -= 16;
-
- Y0 = RND;
- Y1 = RND;
- /* extract 16 coeffs from lumSrc */
- for(j=0; j<lumFilterSize; j++) {
- X0 = vec_ld (0, &lumSrc[j][i]);
- X1 = vec_ld (16, &lumSrc[j][i]);
- Y0 = vec_mradds (X0, YCoeffs[j], Y0);
- Y1 = vec_mradds (X1, YCoeffs[j], Y1);
- }
-
- U = RND;
- V = RND;
- /* extract 8 coeffs from U,V */
- for(j=0; j<chrFilterSize; j++) {
- X = vec_ld (0, &chrSrc[j][i/2]);
- U = vec_mradds (X, CCoeffs[j], U);
- X = vec_ld (0, &chrSrc[j][i/2+2048]);
- V = vec_mradds (X, CCoeffs[j], V);
- }
-
- /* scale and clip signals */
- Y0 = vec_sra (Y0, SCL);
- Y1 = vec_sra (Y1, SCL);
- U = vec_sra (U, SCL);
- V = vec_sra (V, SCL);
-
- Y0 = vec_clip (Y0);
- Y1 = vec_clip (Y1);
- U = vec_clip (U);
- V = vec_clip (V);
-
- /* now we have
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
-
- Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
- U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
- V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
- */
-
- U0 = vec_mergeh (U,U);
- V0 = vec_mergeh (V,V);
-
- U1 = vec_mergel (U,U);
- V1 = vec_mergel (V,V);
-
- cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
- cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
- R = vec_packclp (R0,R1);
- G = vec_packclp (G0,G1);
- B = vec_packclp (B0,B1);
-
- nout = (vector unsigned char *)scratch;
- switch(c->dstFormat) {
- case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
- case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
- case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
- case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
- case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
- case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
- default:
- /* Unreachable, I think. */
- MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
- vo_format_name(c->dstFormat));
- return;
- }
-
- memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
+ switch(c->dstFormat) {
+ case IMGFMT_ABGR:
+ altivec_scaled_yuv2_abgr32(c,
+ lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+ break;
+ case IMGFMT_BGRA:
+ altivec_scaled_yuv2_bgra32(c,
+ lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+ break;
+ case IMGFMT_RGBA:
+ altivec_scaled_yuv2_rgba32(c,
+ lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+ break;
+ case IMGFMT_ARGB:
+ altivec_scaled_yuv2_argb32(c,
+ lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+ break;
+ case IMGFMT_RGB24:
+ altivec_scaled_yuv2_rgb24(c,
+ lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+ break;
+ case IMGFMT_BGR24:
+ altivec_scaled_yuv2_bgr24(c,
+ lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+ break;
+ default:
+ {
+ /* FIXME: either write more converters or punt to yuv2packedXinC */
+ static int printed_error_message;
+ if(!printed_error_message) {
+ MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
+ vo_format_name(c->dstFormat));
+ printed_error_message=1;
+ }
+ return;
+ }
}
-
}
More information about the MPlayer-dev-eng
mailing list