[MPlayer-dev-eng] [PATCH 7/9] libswscale: Optimize yuv2rgb conversion for avr32

Mon Feb 16 17:16:56 CET 2009

Implemented by Ronny Pedersen.

Signed-off-by: Hans-Christian Egtvedt <hans-christian.egtvedt at atmel.com>
---
 libswscale/Makefile           |    1 +
 libswscale/pico-avr32.h       |  134 +++++++++++++
 libswscale/swscale_internal.h |    3 +
 libswscale/yuv2rgb.c          |   11 +-
 libswscale/yuv2rgb_avr32.c    |  413 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 561 insertions(+), 1 deletions(-)
 create mode 100644 libswscale/pico-avr32.h
 create mode 100644 libswscale/yuv2rgb_avr32.c

diff --git a/libswscale/Makefile b/libswscale/Makefile
index a959661..cd4da7f 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -7,6 +7,7 @@ HEADERS = swscale.h
 
 OBJS = rgb2rgb.o swscale.o swscale_avoption.o
 
+OBJS-$(ARCH_AVR32)         +=  yuv2rgb_avr32.o
 OBJS-$(ARCH_BFIN)          +=  internal_bfin.o swscale_bfin.o yuv2rgb_bfin.o
 OBJS-$(CONFIG_GPL)         +=  yuv2rgb.o
 OBJS-$(CONFIG_MLIB)        +=  yuv2rgb_mlib.o
diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h
new file mode 100644
index 0000000..2df5c2e
--- /dev/null
+++ b/libswscale/pico-avr32.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2007-2009 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#ifndef __PICO_H__
+#define __PICO_H__
+
+/* Coprocessor number: passed to the coprocessor builtins and pasted into the "cpN" asm operand */
+#define PICO_CPNO 1
+
+/* Pixel Coprocessor Register file: crN names, used in the ldcm/stcm register lists */
+#define PICO_REGVECT_INPIX2  cr0
+#define PICO_REGVECT_INPIX1  cr1
+#define PICO_REGVECT_INPIX0  cr2
+#define PICO_REGVECT_OUTPIX2 cr3
+#define PICO_REGVECT_OUTPIX1 cr4
+#define PICO_REGVECT_OUTPIX0 cr5
+#define PICO_REGVECT_COEFF0_A cr6
+#define PICO_REGVECT_COEFF0_B cr7
+#define PICO_REGVECT_COEFF1_A cr8
+#define PICO_REGVECT_COEFF1_B cr9
+#define PICO_REGVECT_COEFF2_A cr10
+#define PICO_REGVECT_COEFF2_B cr11
+#define PICO_REGVECT_VMU0_OUT cr12
+#define PICO_REGVECT_VMU1_OUT cr13
+#define PICO_REGVECT_VMU2_OUT cr14
+#define PICO_REGVECT_CONFIG   cr15
+/* The same registers as plain indices, for PICO_PUT_x()/PICO_GET_x() */
+#define PICO_INPIX2  0
+#define PICO_INPIX1  1
+#define PICO_INPIX0  2
+#define PICO_OUTPIX2 3
+#define PICO_OUTPIX1 4
+#define PICO_OUTPIX0 5
+#define PICO_COEFF0_A 6
+#define PICO_COEFF0_B 7
+#define PICO_COEFF1_A 8
+#define PICO_COEFF1_B 9
+#define PICO_COEFF2_A 10
+#define PICO_COEFF2_B 11
+#define PICO_VMU0_OUT 12
+#define PICO_VMU1_OUT 13
+#define PICO_VMU2_OUT 14
+#define PICO_CONFIG   15
+
+/* Config Register: shift amount of each field; *_WIDTH is presumably the field width in bits (confirm) */
+#define PICO_COEFF_FRAC_BITS  0
+#define PICO_COEFF_FRAC_BITS_WIDTH  4
+#define PICO_OFFSET_FRAC_BITS  4
+#define PICO_OFFSET_FRAC_BITS_WIDTH  4
+#define PICO_INPUT_MODE  8
+#define PICO_INPUT_MODE_WIDTH  2
+#define PICO_OUTPUT_MODE 10
+/* Input mode values (shifted left by PICO_INPUT_MODE when building the config word) */
+#define PICO_TRANSFORMATION_MODE 0
+#define PICO_HOR_FILTER_MODE 1
+#define PICO_VERT_FILTER_MODE 2
+/* Output mode values (shifted left by PICO_OUTPUT_MODE when building the config word) */
+#define PICO_PLANAR_MODE 1
+#define PICO_PACKED_MODE 0
+
+/* Bits in coefficients */
+#define PICO_COEFF_BITS 12
+
+/* Operation bits, meant to be OR-ed into the "op" argument of PICO_OP() (TODO confirm) */
+#define PICO_USE_ACC (1 << 2)
+#define PICO_SINGLE_VECTOR (1 << 3)
+
+#define __str(x...) #x
+#define __xstr(x...) __str(x)
+/* Register moves: PUT writes x into pico_reg, GET yields its value (_W/_D pick the .w/.d builtin). */
+#define PICO_PUT_W(pico_reg, x) \
+  __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
+#define PICO_GET_W(pico_reg) \
+  __builtin_mvcr_w(PICO_CPNO, pico_reg)
+
+#define PICO_PUT_D(pico_reg, x) \
+  __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
+#define PICO_GET_D(pico_reg) \
+  __builtin_mvcr_d(PICO_CPNO, pico_reg)
+/* stcm/ldcm access memory through a bare "r" pointer, hence the "memory" clobber on each asm below. */
+#define PICO_STCM_W(ptr, pico_regs...) \
+  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
+#define PICO_STCM_D(ptr, pico_regs...) \
+  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
+
+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
+  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
+  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
+
+#define PICO_LDCM_W(ptr, pico_regs...) \
+  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
+#define PICO_LDCM_D(ptr, pico_regs...) \
+  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
+
+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
+  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
+  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
+/* Issue one coprocessor operation; op and dst_addr are parenthesized so expression arguments combine safely. */
+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
+  __builtin_cop(PICO_CPNO, addr0, addr1, addr2, (op) | (dst_addr));
+
+#endif /* __PICO_H__ */
+
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index cf15742..c317a68 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -222,6 +222,9 @@ void altivec_yuv2packedX (SwsContext *c,
                           int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                           uint8_t *dest, int dstW, int dstY);
 
+SwsFunc yuv2rgb_init_avr32 (SwsContext *c);
+int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
+
 const char *sws_format_name(int format);
 
 //FIXME replace this with something faster
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index fe90a04..a8bec94 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -520,7 +520,12 @@ SwsFunc sws_yuv2rgb_get_func_ptr (SwsContext *c)
         if (t) return t;
     }
 #endif
-
+#if ARCH_AVR32
+    {   /* use the AVR32 converter when it handles this destination format */
+        SwsFunc t = yuv2rgb_init_avr32(c);
+        if (t) return t;
+    }
+#endif /* ARCH_AVR32 */
 #if ARCH_BFIN
     if (c->flags & SWS_CPU_CAPS_BFIN)
     {
@@ -613,6 +618,10 @@ int sws_yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRa
 //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
     oy -= 256*brightness;
 
+#if ARCH_AVR32
+    yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
+#endif /* ARCH_AVR32 */
+
     for (i = 0; i < 1024; i++) {
         int j;
 
diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c
new file mode 100644
index 0000000..7283e6f
--- /dev/null
+++ b/libswscale/yuv2rgb_avr32.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#include <libavutil/avutil.h>
+
+#include "swscale.h"
+#include "swscale_internal.h"
+#include "pico-avr32.h"
+
+#define RGB(uv_part)  /* load the r, g, b table pointers for one U/V sample; uv_part ("t"/"u"/"l"/"b" at the call sites) is pasted in as the register-part selector, presumably picking one byte of U and V (confirm against the AVR32 ISA) */ \
+      __asm__ volatile (        \
+                        "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* r (as scratch) = c->table_gV[V] */ \
+                        "ld.w\t%1, %4[%8:" uv_part "  << 2]\n\t" /* g = c->table_gU[U] */  \
+                        "ld.w\t%2, %5[%8:" uv_part "  << 2]\n\t" /* b = c->table_bU[U] */  \
+                        "add\t%1, %0\n\t" /* g += scratch, i.e. g = table_gU[U] + table_gV[V] */\
+                        "ld.w\t%0, %6[%7:" uv_part "  << 2]" /* r = c->table_rV[V] */ \
+                        : "=&r" (r), "=&r" (g), "=&r" (b) \
+                        : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
+                        "r" (&c->table_rV[0]), "r" (V), "r" (U));
+
+#undef YUV2RGB1 /* below: convert luma src[2*idx], src[2*idx+1] to six bytes R,G,B,R,G,B at dst[6*idx] */
+#define YUV2RGB1(dst, src, y, idx) \
+  { int tmp2;    __asm__ volatile (      \
+                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx + 0] = r[y] */   \
+                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx + 1] = g[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
+                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx + 2] = b of first pixel */   \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 3], %1\n\t"         /* dst[6*idx + 3] = r[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx + 4] = g[y] */   \
+                        "st.b\t%7[6*%8 + 5], %1"         /* dst[6*idx + 5] = b[y] */   \
+                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory" /* asm reads src[], writes dst[] */); }
+
+#undef YUV2RGB2 /* below: convert luma src[2*idx], src[2*idx+1] to six bytes R,G,B,R,G,B at dst[6*idx] */
+#define YUV2RGB2(dst, src, y, idx) \
+  { int tmp2;    __asm__ volatile (      \
+                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx + 0] = r[y] */   \
+                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx + 1] = g[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
+                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx + 2] = b of first pixel */   \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 3], %1\n\t"         /* dst[6*idx + 3] = r[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx + 4] = g[y] */   \
+                        "st.b\t%7[6*%8 + 5], %1"         /* dst[6*idx + 5] = b[y] */   \
+                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory" /* asm reads src[], writes dst[] */); }
+
+
+#undef YUV2BGR1 /* below: convert luma src[2*idx], src[2*idx+1] to six bytes B,G,R,B,G,R at dst[6*idx] */
+#define YUV2BGR1(dst, src, y, idx) \
+  { int tmp2;    __asm__ volatile (      \
+                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx + 2] = r[y] */   \
+                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx + 1] = g[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
+                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx + 0] = b of first pixel */   \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 5], %1\n\t"         /* dst[6*idx + 5] = r[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx + 4] = g[y] */   \
+                        "st.b\t%7[6*%8 + 3], %1"         /* dst[6*idx + 3] = b[y] */   \
+                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory" /* asm reads src[], writes dst[] */); }
+
+#undef YUV2BGR2 /* below: convert luma src[2*idx], src[2*idx+1] to six bytes B,G,R,B,G,R at dst[6*idx] */
+#define YUV2BGR2(dst, src, y, idx) \
+  { int tmp2;    __asm__ volatile (      \
+                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx + 2] = r[y] */   \
+                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx + 1] = g[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \
+                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx + 0] = b of first pixel */   \
+                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
+                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
+                        "st.b\t%7[6*%8 + 5], %1\n\t"         /* dst[6*idx + 5] = r[y] */   \
+                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
+                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx + 4] = g[y] */   \
+                        "st.b\t%7[6*%8 + 3], %1"         /* dst[6*idx + 3] = b[y] */   \
+                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx) : "memory" /* asm reads src[], writes dst[] */); }
+/* Planar YUV (src[0..2] = Y, U, V) to packed BGR24 through the c->table_* lookup tables. */
+static int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                           int srcSliceH, uint8_t* dst[], int dstStride[]){
+  /* One chroma line feeds two luma lines; YUV422P carries a chroma line per luma
+   * line, so its chroma stride is doubled -- locally, leaving srcStride[] intact. */
+  const int chromaMul = (c->srcFormat == PIX_FMT_YUV422P) ? 2 : 1;
+  const int uStride = srcStride[1] * chromaMul;
+  const int vStride = srcStride[2] * chromaMul;
+  int y;
+
+  /* Two output rows per pass, eight pixels per inner iteration (dstW % 8 pixels are not converted). */
+  for(y=0; y<srcSliceH; y+=2){
+    uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
+    uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+    uint8_t *r, *g, *b; /* table pointers loaded by RGB(), indexed by luma in YUV2BGRx() */
+    uint8_t *py_1= src[0] + y*srcStride[0];
+    uint8_t *py_2= py_1 + srcStride[0];
+    uint8_t *pu= src[1] + (y>>1)*uStride;
+    uint8_t *pv= src[2] + (y>>1)*vStride;
+    unsigned int h_size= c->dstW>>3;
+    while (h_size--) {
+      uint32_t U, V, Y1, Y2, tmp;
+      U = ((uint32_t*)pu)[0]; /* four chroma samples per word, one per RGB() selector */
+      V = ((uint32_t*)pv)[0];
+
+      RGB("t")
+        YUV2BGR1(dst_1, py_1, Y1, 0)
+        YUV2BGR1(dst_2, py_2, Y2, 0)
+
+      RGB("u")
+        YUV2BGR2(dst_1, py_1, Y1, 1)
+        YUV2BGR2(dst_2, py_2, Y2, 1)
+
+      RGB("l")
+        YUV2BGR1(dst_1, py_1, Y1, 2)
+        YUV2BGR1(dst_2, py_2, Y2, 2)
+
+      RGB("b")
+        YUV2BGR2(dst_1, py_1, Y1, 3)
+        YUV2BGR2(dst_2, py_2, Y2, 3)
+
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      py_2 += 8;
+      dst_1 += 24;
+      dst_2 += 24;
+    }
+  }
+  return srcSliceH;
+}
+
+/* Planar YUV (src[0..2] = Y, U, V) to packed RGB24 through the c->table_* lookup tables.
+ * Returns srcSliceH. */
+static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                             int srcSliceH, uint8_t* dst[], int dstStride[]){
+  const int chromaMul = (c->srcFormat == PIX_FMT_YUV422P) ? 2 : 1; /* 4:2:2: skip every other chroma line */
+  const int uStride = srcStride[1] * chromaMul; /* local copies: srcStride[] is no longer modified */
+  const int vStride = srcStride[2] * chromaMul;
+  int y;
+
+  /* Two output rows per pass, eight pixels per inner iteration (dstW % 8 pixels are not converted). */
+  for(y=0; y<srcSliceH; y+=2){
+    uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
+    uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+    uint8_t *r, *g, *b; /* table pointers loaded by RGB(), indexed by luma in YUV2RGBx() */
+    uint8_t *py_1= src[0] + y*srcStride[0];
+    uint8_t *py_2= py_1 + srcStride[0];
+    uint8_t *pu= src[1] + (y>>1)*uStride;
+    uint8_t *pv= src[2] + (y>>1)*vStride;
+    unsigned int h_size= c->dstW>>3;
+    while (h_size--) {
+      uint32_t U, V, Y1, Y2, tmp;
+      U = ((uint32_t*)pu)[0]; /* four chroma samples per word, one per RGB() selector */
+      V = ((uint32_t*)pv)[0];
+
+      RGB("t")
+        YUV2RGB1(dst_1, py_1, Y1, 0)
+        YUV2RGB1(dst_2, py_2, Y2, 0)
+
+      RGB("u")
+        YUV2RGB2(dst_1, py_1, Y1, 1)
+        YUV2RGB2(dst_2, py_2, Y2, 1)
+
+      RGB("l")
+        YUV2RGB1(dst_1, py_1, Y1, 2)
+        YUV2RGB1(dst_2, py_2, Y2, 2)
+
+      RGB("b")
+        YUV2RGB2(dst_1, py_1, Y1, 3)
+        YUV2RGB2(dst_2, py_2, Y2, 3)
+
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      py_2 += 8;
+      dst_1 += 24;
+      dst_2 += 24;
+    }
+  }
+  return srcSliceH;
+}
+
+#define SCALE(x, bits) (((x) + ( 1 << ((bits) - 1))) >> (bits)) /* right shift with rounding; bits parenthesized so expression arguments are safe */
+#define COEFF_FRAC_BITS  9 /* fractional bits of the PICO coefficients (also written to the config word) */
+#define OFFSET_FRAC_BITS  2 /* fractional bits of the PICO offsets (also written to the config word) */
+
+/* Coefficients used in the pico; loaded as one block by PICO_LDCM_D() into COEFF0_A..COEFF2_B */
+static struct { /* coeffN_0/_1/_2 multiply Y/U/V, coeffN_3 is the offset (see yuv2rgb_c_init_tables_avr32) */
+  short coeff2_2; /* NOTE(review): field order presumably mirrors the register layout -- confirm */
+  short coeff2_3;
+  short coeff2_0;
+  short coeff2_1;
+  short coeff1_2;
+  short coeff1_3;
+  short coeff1_0;
+  short coeff1_1;
+  short coeff0_2;
+  short coeff0_3;
+  short coeff0_0;
+  short coeff0_1;
+} pico_coeff;
+
+/* Planar YUV to packed 24-bit output on the PICO coprocessor; channel order follows pico_coeff. */
+static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                int srcSliceH, uint8_t* dst[], int dstStride[]){
+  /* YUV422P: step over every other chroma line, using local strides so srcStride[] stays intact. */
+  const int chromaMul = (c->srcFormat == PIX_FMT_YUV422P) ? 2 : 1;
+  const int uStride = srcStride[1] * chromaMul;
+  const int vStride = srcStride[2] * chromaMul;
+  int y;
+
+  /* Initialize pico: load all six coefficient registers from pico_coeff */
+  PICO_LDCM_D(&pico_coeff,
+              PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+              PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+              PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
+
+  /* Packed output, transformation input mode, fixed-point formats matching pico_coeff. */
+  PICO_PUT_W(PICO_CONFIG,
+             (PICO_PACKED_MODE << PICO_OUTPUT_MODE
+              | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
+              | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
+              | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
+
+  /* Two output rows per pass, eight pixels per inner iteration (dstW % 8 pixels are not converted). */
+  for(y=0; y<srcSliceH; y+=2){
+    uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
+    uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+    uint8_t *py_1= src[0] + y*srcStride[0];
+    uint8_t *py_2= py_1 + srcStride[0];
+    uint8_t *pu= src[1] + (y>>1)*uStride;
+    uint8_t *pv= src[2] + (y>>1)*vStride;
+    unsigned int h_size= c->dstW>>3;
+    int *py_1_int = (int *)py_1;
+    int *py_2_int = (int *)py_2;
+    int *pu_int = (int *)pu;
+    int *pv_int = (int *)pv;
+    while (h_size--) {
+      PICO_PUT_W(PICO_INPIX0, *py_1_int++);
+      PICO_PUT_W(PICO_INPIX1, *pu_int++);
+      PICO_PUT_W(PICO_INPIX2, *pv_int++);
+      PICO_OP(0, 0, 0, 4, 8);
+      PICO_OP(0, 1, 1, 4, 8);
+      PICO_OP(0, 2, 2, 5, 9);
+      PICO_OP(0, 3, 3, 5, 9);
+      PICO_PUT_W(PICO_INPIX0, *py_1_int++);
+      PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+      PICO_OP(0, 0, 0, 6, 10);
+      PICO_OP(0, 1, 1, 6, 10);
+      PICO_OP(0, 2, 2, 7, 11);
+      PICO_OP(0, 3, 3, 7, 11);
+      PICO_PUT_W(PICO_INPIX0, *py_2_int++);
+      PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+      /* Second row: same sequence, with the U/V words still held in INPIX1/INPIX2. */
+      PICO_OP(0, 0, 0, 4, 8);
+      PICO_OP(0, 1, 1, 4, 8);
+      PICO_OP(0, 2, 2, 5, 9);
+      PICO_OP(0, 3, 3, 5, 9);
+      PICO_PUT_W(PICO_INPIX0, *py_2_int++);
+      PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+      PICO_OP(0, 0, 0, 6, 10);
+      PICO_OP(0, 1, 1, 6, 10);
+      PICO_OP(0, 2, 2, 7, 11);
+      PICO_OP(0, 3, 3, 7, 11);
+      PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+
+      dst_1 += 24;
+      dst_2 += 24;
+    }
+  }
+  return srcSliceH;
+}
+
+extern int avr32_use_pico;
+
+SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
+  /* Select the AVR32 converter for the destination format.  avr32_use_pico
+   * chooses between the PICO coprocessor routine and the table-lookup code;
+   * any other destination format yields NULL. */
+  if (c->dstFormat == PIX_FMT_BGR24) {
+    if (avr32_use_pico) {
+      av_log(c, AV_LOG_INFO, "AVR32 BGR24: Using PICO for color space conversion\n");
+      return yuv2bgr24_avr32_pico;
+    }
+    av_log(c, AV_LOG_INFO, "AVR32 BGR24: Using optimized color space conversion\n");
+    return yuv2bgr24_avr32;
+  }
+
+  if (c->dstFormat == PIX_FMT_RGB24) {
+    if (avr32_use_pico) {
+      /* The PICO routine is shared with BGR24: yuv2rgb_c_init_tables_avr32()
+       * assigns the R and B coefficient sets according to the destination format. */
+      av_log(c, AV_LOG_INFO, "AVR32 RGB24: Using PICO for color space conversion\n");
+      return yuv2bgr24_avr32_pico;
+    }
+    av_log(c, AV_LOG_INFO, "AVR32 RGB24: Using optimized color space conversion\n");
+    return yuv2rgb24_avr32;
+  }
+
+  return NULL;
+}
+
+/* Fill pico_coeff with fixed-point YUV->RGB gains and offsets for the PICO routine; always returns 0. */
+int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
+  const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
+  int64_t yoff; /* luma offset term (cy*oy, 16.16 fixed point) subtracted in every channel offset */
+  int64_t crv =  inv_table[0];
+  int64_t cbu =  inv_table[1];
+  int64_t cgu = -inv_table[2];
+  int64_t cgv = -inv_table[3];
+  int64_t cy  = 1<<16;
+  int64_t oy  = 0;
+
+  if(!fullRange){
+    cy= (cy*255) / 219;
+    oy= 16<<16;
+  }
+
+  cy = (cy *contrast             )>>16;
+  crv= (crv*contrast * saturation)>>32;
+  cbu= (cbu*contrast * saturation)>>32;
+  cgu= (cgu*contrast * saturation)>>32;
+  cgv= (cgv*contrast * saturation)>>32;
+
+  oy -= 256*brightness;
+  yoff = (cy*oy)>>16; /* equals 16*cy for limited range and zero brightness; honours fullRange/brightness otherwise */
+  pico_coeff.coeff1_0 = SCALE(cy, (16 - COEFF_FRAC_BITS)); /* G <- Y */
+  pico_coeff.coeff1_1 = SCALE(cgu, (16 - COEFF_FRAC_BITS)); /* G <- U */
+  pico_coeff.coeff1_2 = SCALE(cgv, (16 - COEFF_FRAC_BITS)); /* G <- V */
+  pico_coeff.coeff1_3 = (SCALE((-128*cgu - 128*cgv - yoff), (16 - OFFSET_FRAC_BITS))
+                         + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
+
+  if ( isRgb ){
+    pico_coeff.coeff0_0 = SCALE(cy, (16 - COEFF_FRAC_BITS)); /* R <- Y */
+    pico_coeff.coeff0_1 = 0; /* R <- U */
+    pico_coeff.coeff0_2 = SCALE(crv, (16 - COEFF_FRAC_BITS)); /* R <- V */
+    pico_coeff.coeff0_3 = (SCALE((-128*crv - yoff), (16 - OFFSET_FRAC_BITS))
+                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
+
+    pico_coeff.coeff2_0 = SCALE(cy, (16 - COEFF_FRAC_BITS)); /* B <- Y */
+    pico_coeff.coeff2_1 = SCALE(cbu, (16 - COEFF_FRAC_BITS)); /* B <- U */
+    pico_coeff.coeff2_2 = 0; /* B <- V */
+    pico_coeff.coeff2_3 = (SCALE((-128*cbu - yoff), (16 - OFFSET_FRAC_BITS))
+                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
+  } else {
+    pico_coeff.coeff2_0 = SCALE(cy, (16 - COEFF_FRAC_BITS)); /* R <- Y */
+    pico_coeff.coeff2_1 = 0; /* R <- U */
+    pico_coeff.coeff2_2 = SCALE(crv, (16 - COEFF_FRAC_BITS)); /* R <- V */
+    pico_coeff.coeff2_3 = (SCALE((-128*crv - yoff), (16 - OFFSET_FRAC_BITS))
+                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
+
+    pico_coeff.coeff0_0 = SCALE(cy, (16 - COEFF_FRAC_BITS)); /* B <- Y */
+    pico_coeff.coeff0_1 = SCALE(cbu, (16 - COEFF_FRAC_BITS)); /* B <- U */
+    pico_coeff.coeff0_2 = 0; /* B <- V */
+    pico_coeff.coeff0_3 = (SCALE((-128*cbu - yoff), (16 - OFFSET_FRAC_BITS))
+                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
+  }
+
+  return 0;
+}
+
+#undef RGB
+
-- 
1.5.6.3