[MPlayer-dev-eng] altivec patch 5/5: don't output the wrong format

Alan Curry pacman at TheWorld.com
Sat Feb 11 23:31:32 CET 2006


I wrote:
>
>OK, I can split altivec_yuv2packedX into separate functions, with a macro to
>declare them all with nearly-identical bodies, following the precedent set by
>DEFCSP420_CVT. The callers will then select the output format by calling the
>specific relevant function, which also gives us a chance to handle the
>formats that aren't supported in the altivec code by calling yuv2packedXinC
>instead.

Well, I tried 2 different variants of this idea, turning the body of
altivec_yuv2packedX into a macro: one where there were actually 6 different
functions and one where the body was expanded 6 times in one function
controlled by a single switch(). That latter case was the macro that broke
the camel's back. gcc ate a gig and a half of memory and died. I could add
more swap, but seriously...

The first one did compile (although it took a long time since it had to
compile that heavy function 6 times), but it didn't give any measurable
improvement over the version with the switch() in the inner loop.

yuv2rgb_altivec.c in its current form is pretty hard on the compiler, using
almost a half gig of memory. That's mostly because of the hideous macros in
<altivec.h> (who really thought it was a good idea to sneak overloaded
functions into C?).

I don't seriously suggest these patches, but in case you want to see for
yourself, here they are.

-------------- next part --------------
--- yuv2rgb_altivec.c	2006-02-11 16:54:07.000000000 -0500
+++ yuv2rgb_altivec.c.try2	2006-02-11 16:29:50.000000000 -0500
@@ -767,6 +767,125 @@
  return;
 }
 
+#define YSCALE_YUV_2_RGBX_ALTIVEC(out_fmt) \
+  for(i=0; i<dstW; i+=16){\
+    Y0 = RND;\
+    Y1 = RND;\
+    /* extract 16 coeffs from lumSrc */\
+    for(j=0; j<lumFilterSize; j++) {\
+      X0 = vec_ld (0,  &lumSrc[j][i]);\
+      X1 = vec_ld (16, &lumSrc[j][i]);\
+      Y0 = vec_mradds (X0, YCoeffs[j], Y0);\
+      Y1 = vec_mradds (X1, YCoeffs[j], Y1);\
+    }\
+    \
+    U = RND;\
+    V = RND;\
+    /* extract 8 coeffs from U,V */\
+    for(j=0; j<chrFilterSize; j++) {\
+      X  = vec_ld (0, &chrSrc[j][i/2]);\
+      U  = vec_mradds (X, CCoeffs[j], U);\
+      X  = vec_ld (0, &chrSrc[j][i/2+2048]);\
+      V  = vec_mradds (X, CCoeffs[j], V);\
+    }\
+    \
+    /* scale and clip signals */\
+    Y0 = vec_sra (Y0, SCL);\
+    Y1 = vec_sra (Y1, SCL);\
+    U  = vec_sra (U,  SCL);\
+    V  = vec_sra (V,  SCL);\
+    \
+    Y0 = vec_clip (Y0);\
+    Y1 = vec_clip (Y1);\
+    U  = vec_clip (U);\
+    V  = vec_clip (V);\
+    \
+    /* now we have\
+      Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+      U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7\
+      \
+      Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+      U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7\
+      V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7\
+    */\
+    \
+    U0 = vec_mergeh (U,U);\
+    V0 = vec_mergeh (V,V);\
+    \
+    U1 = vec_mergel (U,U);\
+    V1 = vec_mergel (V,V);\
+    \
+    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);\
+    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);\
+    \
+    R  = vec_packclp (R0,R1);\
+    G  = vec_packclp (G0,G1);\
+    B  = vec_packclp (B0,B1);\
+    \
+    out_fmt (R,G,B,out);\
+  }\
+  \
+  if (i < dstW) {\
+    i -= 16;\
+    \
+    Y0 = RND;\
+    Y1 = RND;\
+    /* extract 16 coeffs from lumSrc */\
+    for(j=0; j<lumFilterSize; j++) {\
+      X0 = vec_ld (0,  &lumSrc[j][i]);\
+      X1 = vec_ld (16, &lumSrc[j][i]);\
+      Y0 = vec_mradds (X0, YCoeffs[j], Y0);\
+      Y1 = vec_mradds (X1, YCoeffs[j], Y1);\
+    }\
+    \
+    U = RND;\
+    V = RND;\
+    /* extract 8 coeffs from U,V */\
+    for(j=0; j<chrFilterSize; j++) {\
+      X  = vec_ld (0, &chrSrc[j][i/2]);\
+      U  = vec_mradds (X, CCoeffs[j], U);\
+      X  = vec_ld (0, &chrSrc[j][i/2+2048]);\
+      V  = vec_mradds (X, CCoeffs[j], V);\
+    }\
+    \
+    /* scale and clip signals */\
+    Y0 = vec_sra (Y0, SCL);\
+    Y1 = vec_sra (Y1, SCL);\
+    U  = vec_sra (U,  SCL);\
+    V  = vec_sra (V,  SCL);\
+    \
+    Y0 = vec_clip (Y0);\
+    Y1 = vec_clip (Y1);\
+    U  = vec_clip (U);\
+    V  = vec_clip (V);\
+    \
+    /* now we have\
+       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7\
+       \
+       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15\
+       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7\
+       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7\
+    */\
+    \
+    U0 = vec_mergeh (U,U);\
+    V0 = vec_mergeh (V,V);\
+    \
+    U1 = vec_mergel (U,U);\
+    V1 = vec_mergel (V,V);\
+    \
+    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);\
+    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);\
+    \
+    R  = vec_packclp (R0,R1);\
+    G  = vec_packclp (G0,G1);\
+    B  = vec_packclp (B0,B1);\
+    \
+    nout = (vector unsigned char *)scratch;\
+    out_fmt (R,G,B,nout);\
+    \
+    memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);\
+  }
 
 void
 altivec_yuv2packedX (SwsContext *c,
@@ -793,153 +912,23 @@
 
   out = (vector unsigned char *)dest;
 
-  for(i=0; i<dstW; i+=16){
-    Y0 = RND;
-    Y1 = RND;
-    /* extract 16 coeffs from lumSrc */
-    for(j=0; j<lumFilterSize; j++) {
-      X0 = vec_ld (0,  &lumSrc[j][i]);
-      X1 = vec_ld (16, &lumSrc[j][i]);
-      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
-      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
-    }
-
-    U = RND;
-    V = RND;
-    /* extract 8 coeffs from U,V */
-    for(j=0; j<chrFilterSize; j++) {
-      X  = vec_ld (0, &chrSrc[j][i/2]);
-      U  = vec_mradds (X, CCoeffs[j], U);
-      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
-      V  = vec_mradds (X, CCoeffs[j], V);
-    }
-
-    /* scale and clip signals */
-    Y0 = vec_sra (Y0, SCL);
-    Y1 = vec_sra (Y1, SCL);
-    U  = vec_sra (U,  SCL);
-    V  = vec_sra (V,  SCL);
-
-    Y0 = vec_clip (Y0);
-    Y1 = vec_clip (Y1);
-    U  = vec_clip (U);
-    V  = vec_clip (V);
-
-    /* now we have
-      Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
-      U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
-
-      Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
-      U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
-      V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
-    */
-
-    U0 = vec_mergeh (U,U);
-    V0 = vec_mergeh (V,V);
-
-    U1 = vec_mergel (U,U);
-    V1 = vec_mergel (V,V);
-
-    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
-    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
-    R  = vec_packclp (R0,R1);
-    G  = vec_packclp (G0,G1);
-    B  = vec_packclp (B0,B1);
-
-    switch(c->dstFormat) {
-      case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
-      case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
-      case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
-      case IMGFMT_ARGB: out_argb (R,G,B,out); break;
-      case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
-      case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
-      default:
-        {
-          /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
-          static int printed_error_message;
-          if(!printed_error_message) {
-            MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
-                    vo_format_name(c->dstFormat));
-            printed_error_message=1;
-          }
-          return;
-        }
-    }
-  }
-
-  if (i < dstW) {
-    i -= 16;
-
-    Y0 = RND;
-    Y1 = RND;
-    /* extract 16 coeffs from lumSrc */
-    for(j=0; j<lumFilterSize; j++) {
-      X0 = vec_ld (0,  &lumSrc[j][i]);
-      X1 = vec_ld (16, &lumSrc[j][i]);
-      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
-      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
-    }
-
-    U = RND;
-    V = RND;
-    /* extract 8 coeffs from U,V */
-    for(j=0; j<chrFilterSize; j++) {
-      X  = vec_ld (0, &chrSrc[j][i/2]);
-      U  = vec_mradds (X, CCoeffs[j], U);
-      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
-      V  = vec_mradds (X, CCoeffs[j], V);
-    }
-
-    /* scale and clip signals */
-    Y0 = vec_sra (Y0, SCL);
-    Y1 = vec_sra (Y1, SCL);
-    U  = vec_sra (U,  SCL);
-    V  = vec_sra (V,  SCL);
-
-    Y0 = vec_clip (Y0);
-    Y1 = vec_clip (Y1);
-    U  = vec_clip (U);
-    V  = vec_clip (V);
-
-    /* now we have
-       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
-       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
-
-       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
-       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
-       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
-    */
-
-    U0 = vec_mergeh (U,U);
-    V0 = vec_mergeh (V,V);
-
-    U1 = vec_mergel (U,U);
-    V1 = vec_mergel (V,V);
-
-    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
-    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
-    R  = vec_packclp (R0,R1);
-    G  = vec_packclp (G0,G1);
-    B  = vec_packclp (B0,B1);
-
-    nout = (vector unsigned char *)scratch;
-    switch(c->dstFormat) {
-      case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
-      case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
-      case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
-      case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
-      case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
-      case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
-      default:
-        /* Unreachable, I think. */
-        MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
-                vo_format_name(c->dstFormat));
-        return;
-    }
-
-    memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
+  switch(c->dstFormat) {
+    case IMGFMT_ABGR: YSCALE_YUV_2_RGBX_ALTIVEC(out_abgr); break;
+    case IMGFMT_BGRA: YSCALE_YUV_2_RGBX_ALTIVEC(out_bgra); break;
+    case IMGFMT_RGBA: YSCALE_YUV_2_RGBX_ALTIVEC(out_rgba); break;
+    case IMGFMT_ARGB: YSCALE_YUV_2_RGBX_ALTIVEC(out_argb); break;
+    case IMGFMT_RGB24: YSCALE_YUV_2_RGBX_ALTIVEC(out_rgb24); break;
+    case IMGFMT_BGR24: YSCALE_YUV_2_RGBX_ALTIVEC(out_bgr24); break;
+    default:
+      {
+	/* FIXME: possibly call yuv2packedXinC here */
+	static int printed_error_message;
+	if(!printed_error_message) {
+	  MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
+		  vo_format_name(c->dstFormat));
+	  printed_error_message=1;
+	}
+	return;
+      }
   }
-
 }
-------------- next part --------------
--- yuv2rgb_altivec.c	2006-02-11 16:54:07.000000000 -0500
+++ yuv2rgb_altivec.c.try1	2006-02-11 16:53:57.000000000 -0500
@@ -767,6 +767,158 @@
  return;
 }
 
+#define DEFCSP420_CVT_SCALED(name,out_pixels)                              \
+static int altivec_scaled_##name (SwsContext *c,                           \
+		  int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, \
+		  int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, \
+		  uint8_t *dest, int dstW, int dstY)                       \
+{                                                                          \
+  int i,j;                                                                 \
+  short *f;                                                                \
+  vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;                       \
+  vector signed short R0,G0,B0,R1,G1,B1;                                   \
+									   \
+  vector unsigned char R,G,B,pels[3];                                      \
+  vector unsigned char *out,*nout;                                         \
+									   \
+  vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0); \
+  vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);  \
+  unsigned long scratch[16] __attribute__ ((aligned (16)));                \
+									   \
+  vector signed short *YCoeffs, *CCoeffs;                                  \
+									   \
+  YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;                            \
+  CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;                            \
+									   \
+  out = (vector unsigned char *)dest;                                      \
+									   \
+  for(i=0; i<dstW; i+=16){                                                 \
+    Y0 = RND;                                                              \
+    Y1 = RND;                                                              \
+    /* extract 16 coeffs from lumSrc */                                    \
+    for(j=0; j<lumFilterSize; j++) {                                       \
+      X0 = vec_ld (0,  &lumSrc[j][i]);                                     \
+      X1 = vec_ld (16, &lumSrc[j][i]);                                     \
+      Y0 = vec_mradds (X0, YCoeffs[j], Y0);                                \
+      Y1 = vec_mradds (X1, YCoeffs[j], Y1);                                \
+    }                                                                      \
+									   \
+    U = RND;                                                               \
+    V = RND;                                                               \
+    /* extract 8 coeffs from U,V */                                        \
+    for(j=0; j<chrFilterSize; j++) {                                       \
+      X  = vec_ld (0, &chrSrc[j][i/2]);                                    \
+      U  = vec_mradds (X, CCoeffs[j], U);                                  \
+      X  = vec_ld (0, &chrSrc[j][i/2+2048]);                               \
+      V  = vec_mradds (X, CCoeffs[j], V);                                  \
+    }                                                                      \
+									   \
+    /* scale and clip signals */                                           \
+    Y0 = vec_sra (Y0, SCL);                                                \
+    Y1 = vec_sra (Y1, SCL);                                                \
+    U  = vec_sra (U,  SCL);                                                \
+    V  = vec_sra (V,  SCL);                                                \
+									   \
+    Y0 = vec_clip (Y0);                                                    \
+    Y1 = vec_clip (Y1);                                                    \
+    U  = vec_clip (U);                                                     \
+    V  = vec_clip (V);                                                     \
+									   \
+    /* now we have                                                         \
+      Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15    \
+      U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7           \
+									   \
+      Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15     \
+      U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7           \
+      V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7           \
+    */                                                                     \
+									   \
+    U0 = vec_mergeh (U,U);                                                 \
+    V0 = vec_mergeh (V,V);                                                 \
+									   \
+    U1 = vec_mergel (U,U);                                                 \
+    V1 = vec_mergel (V,V);                                                 \
+									   \
+    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);                                 \
+    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);                                 \
+									   \
+    R  = vec_packclp (R0,R1);                                              \
+    G  = vec_packclp (G0,G1);                                              \
+    B  = vec_packclp (B0,B1);                                              \
+									   \
+    out_pixels(R,G,B,out);						   \
+  }                                                                        \
+									   \
+  if (i < dstW) {                                                          \
+    i -= 16;                                                               \
+									   \
+    Y0 = RND;                                                              \
+    Y1 = RND;                                                              \
+    /* extract 16 coeffs from lumSrc */                                    \
+    for(j=0; j<lumFilterSize; j++) {                                       \
+      X0 = vec_ld (0,  &lumSrc[j][i]);                                     \
+      X1 = vec_ld (16, &lumSrc[j][i]);                                     \
+      Y0 = vec_mradds (X0, YCoeffs[j], Y0);                                \
+      Y1 = vec_mradds (X1, YCoeffs[j], Y1);                                \
+    }                                                                      \
+									   \
+    U = RND;                                                               \
+    V = RND;                                                               \
+    /* extract 8 coeffs from U,V */                                        \
+    for(j=0; j<chrFilterSize; j++) {                                       \
+      X  = vec_ld (0, &chrSrc[j][i/2]);                                    \
+      U  = vec_mradds (X, CCoeffs[j], U);                                  \
+      X  = vec_ld (0, &chrSrc[j][i/2+2048]);                               \
+      V  = vec_mradds (X, CCoeffs[j], V);                                  \
+    }                                                                      \
+									   \
+    /* scale and clip signals */                                           \
+    Y0 = vec_sra (Y0, SCL);                                                \
+    Y1 = vec_sra (Y1, SCL);                                                \
+    U  = vec_sra (U,  SCL);                                                \
+    V  = vec_sra (V,  SCL);                                                \
+									   \
+    Y0 = vec_clip (Y0);                                                    \
+    Y1 = vec_clip (Y1);                                                    \
+    U  = vec_clip (U);                                                     \
+    V  = vec_clip (V);                                                     \
+									   \
+    /* now we have                                                         \
+       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15   \
+       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7          \
+									   \
+       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15    \
+       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7          \
+       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7          \
+    */                                                                     \
+									   \
+    U0 = vec_mergeh (U,U);                                                 \
+    V0 = vec_mergeh (V,V);                                                 \
+									   \
+    U1 = vec_mergel (U,U);                                                 \
+    V1 = vec_mergel (V,V);                                                 \
+									   \
+    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);                                 \
+    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);                                 \
+									   \
+    R  = vec_packclp (R0,R1);                                              \
+    G  = vec_packclp (G0,G1);                                              \
+    B  = vec_packclp (B0,B1);                                              \
+									   \
+    nout = (vector unsigned char *)scratch;                                \
+    out_pixels(R,G,B,nout);						   \
+									   \
+    memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);                   \
+  }                                                                        \
+									   \
+}
+
+DEFCSP420_CVT_SCALED (yuv2_abgr32, out_abgr)
+DEFCSP420_CVT_SCALED (yuv2_bgra32, out_bgra) /* IMGFMT_BGRA needs out_bgra, matching the switch this replaces */
+DEFCSP420_CVT_SCALED (yuv2_rgba32, out_rgba)
+DEFCSP420_CVT_SCALED (yuv2_argb32, out_argb)
+DEFCSP420_CVT_SCALED (yuv2_rgb24,  out_rgb24)
+DEFCSP420_CVT_SCALED (yuv2_bgr24,  out_bgr24)
 
 void
 altivec_yuv2packedX (SwsContext *c,
@@ -774,172 +926,47 @@
 		       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 		       uint8_t *dest, int dstW, int dstY)
 {
-  int i,j;
-  short *f;
-  vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
-  vector signed short R0,G0,B0,R1,G1,B1;
-
-  vector unsigned char R,G,B,pels[3];
-  vector unsigned char *out,*nout;
-
-  vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0);
-  vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
-  unsigned long scratch[16] __attribute__ ((aligned (16)));
-
-  vector signed short *YCoeffs, *CCoeffs;
-
-  YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
-  CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
-
-  out = (vector unsigned char *)dest;
-
-  for(i=0; i<dstW; i+=16){
-    Y0 = RND;
-    Y1 = RND;
-    /* extract 16 coeffs from lumSrc */
-    for(j=0; j<lumFilterSize; j++) {
-      X0 = vec_ld (0,  &lumSrc[j][i]);
-      X1 = vec_ld (16, &lumSrc[j][i]);
-      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
-      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
-    }
-
-    U = RND;
-    V = RND;
-    /* extract 8 coeffs from U,V */
-    for(j=0; j<chrFilterSize; j++) {
-      X  = vec_ld (0, &chrSrc[j][i/2]);
-      U  = vec_mradds (X, CCoeffs[j], U);
-      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
-      V  = vec_mradds (X, CCoeffs[j], V);
-    }
-
-    /* scale and clip signals */
-    Y0 = vec_sra (Y0, SCL);
-    Y1 = vec_sra (Y1, SCL);
-    U  = vec_sra (U,  SCL);
-    V  = vec_sra (V,  SCL);
-
-    Y0 = vec_clip (Y0);
-    Y1 = vec_clip (Y1);
-    U  = vec_clip (U);
-    V  = vec_clip (V);
-
-    /* now we have
-      Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
-      U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
-
-      Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
-      U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
-      V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
-    */
-
-    U0 = vec_mergeh (U,U);
-    V0 = vec_mergeh (V,V);
-
-    U1 = vec_mergel (U,U);
-    V1 = vec_mergel (V,V);
-
-    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
-    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
-    R  = vec_packclp (R0,R1);
-    G  = vec_packclp (G0,G1);
-    B  = vec_packclp (B0,B1);
-
-    switch(c->dstFormat) {
-      case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
-      case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
-      case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
-      case IMGFMT_ARGB: out_argb (R,G,B,out); break;
-      case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
-      case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
-      default:
-        {
-          /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
-          static int printed_error_message;
-          if(!printed_error_message) {
-            MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
-                    vo_format_name(c->dstFormat));
-            printed_error_message=1;
-          }
-          return;
-        }
-    }
-  }
-
-  if (i < dstW) {
-    i -= 16;
-
-    Y0 = RND;
-    Y1 = RND;
-    /* extract 16 coeffs from lumSrc */
-    for(j=0; j<lumFilterSize; j++) {
-      X0 = vec_ld (0,  &lumSrc[j][i]);
-      X1 = vec_ld (16, &lumSrc[j][i]);
-      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
-      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
-    }
-
-    U = RND;
-    V = RND;
-    /* extract 8 coeffs from U,V */
-    for(j=0; j<chrFilterSize; j++) {
-      X  = vec_ld (0, &chrSrc[j][i/2]);
-      U  = vec_mradds (X, CCoeffs[j], U);
-      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
-      V  = vec_mradds (X, CCoeffs[j], V);
-    }
-
-    /* scale and clip signals */
-    Y0 = vec_sra (Y0, SCL);
-    Y1 = vec_sra (Y1, SCL);
-    U  = vec_sra (U,  SCL);
-    V  = vec_sra (V,  SCL);
-
-    Y0 = vec_clip (Y0);
-    Y1 = vec_clip (Y1);
-    U  = vec_clip (U);
-    V  = vec_clip (V);
-
-    /* now we have
-       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
-       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
-
-       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
-       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
-       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
-    */
-
-    U0 = vec_mergeh (U,U);
-    V0 = vec_mergeh (V,V);
-
-    U1 = vec_mergel (U,U);
-    V1 = vec_mergel (V,V);
-
-    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
-    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
-    R  = vec_packclp (R0,R1);
-    G  = vec_packclp (G0,G1);
-    B  = vec_packclp (B0,B1);
-
-    nout = (vector unsigned char *)scratch;
-    switch(c->dstFormat) {
-      case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
-      case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
-      case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
-      case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
-      case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
-      case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
-      default:
-        /* Unreachable, I think. */
-        MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
-                vo_format_name(c->dstFormat));
-        return;
-    }
-
-    memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
+  switch(c->dstFormat) {
+    case IMGFMT_ABGR:
+      altivec_scaled_yuv2_abgr32(c,
+	lumFilter, lumSrc, lumFilterSize,
+	chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+      break;
+    case IMGFMT_BGRA:
+      altivec_scaled_yuv2_bgra32(c,
+	lumFilter, lumSrc, lumFilterSize,
+	chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+      break;
+    case IMGFMT_RGBA:
+      altivec_scaled_yuv2_rgba32(c,
+	lumFilter, lumSrc, lumFilterSize,
+	chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+      break;
+    case IMGFMT_ARGB:
+      altivec_scaled_yuv2_argb32(c,
+	lumFilter, lumSrc, lumFilterSize,
+	chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+      break;
+    case IMGFMT_RGB24:
+      altivec_scaled_yuv2_rgb24(c,
+	lumFilter, lumSrc, lumFilterSize,
+	chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+      break;
+    case IMGFMT_BGR24:
+      altivec_scaled_yuv2_bgr24(c,
+	lumFilter, lumSrc, lumFilterSize,
+	chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY);
+      break;
+    default:
+      {
+	/* FIXME: either write more converters or punt to yuv2packedXinC */
+	static int printed_error_message;
+	if(!printed_error_message) {
+	  MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
+		  vo_format_name(c->dstFormat));
+	  printed_error_message=1;
+	}
+	return;
+      }
   }
-
 }


More information about the MPlayer-dev-eng mailing list