[MPlayer-dev-eng] [PATCH 5/9] libavcodec: add AVR32 specific optimizations
Hans-Christian Egtvedt
hans-christian.egtvedt at atmel.com
Mon Feb 16 17:16:54 CET 2009
Implemented by Ronny Pedersen.
Signed-off-by: Hans-Christian Egtvedt <hans-christian.egtvedt at atmel.com>
---
 cfg-common-opts.h                |    5 +-
 cfg-common.h                     |    4 +
 libavcodec/Makefile              |    6 +-
 libavcodec/avr32/dsputil_avr32.c | 2492 ++++++++++++++++++++++++++++++++++++++
 libavcodec/avr32/fdct.S          |  538 ++++++++
 libavcodec/avr32/h264idct.S      |  451 +++++++
 libavcodec/avr32/idct.S          |  793 ++++++++++++
 libavcodec/avr32/mc.S            |  434 +++++++
 libavcodec/avr32/pico.h          |  255 ++++
 libavcodec/bitstream.h           |   76 ++-
 libavcodec/dsputil.c             |    1 +
 libavcodec/h264.c                |   15 +
 12 files changed, 5065 insertions(+), 5 deletions(-)
create mode 100644 libavcodec/avr32/dsputil_avr32.c
create mode 100644 libavcodec/avr32/fdct.S
create mode 100644 libavcodec/avr32/h264idct.S
create mode 100644 libavcodec/avr32/idct.S
create mode 100644 libavcodec/avr32/mc.S
create mode 100644 libavcodec/avr32/pico.h
diff --git a/cfg-common-opts.h b/cfg-common-opts.h
index 13d3f38..a7ec2f3 100644
--- a/cfg-common-opts.h
+++ b/cfg-common-opts.h
@@ -250,7 +250,10 @@
{"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
{"psprobe", &ps_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
{"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
-
+#ifdef ARCH_AVR32
+ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+#endif
// draw by slices or whole frame (useful with libmpeg2/libavcodec)
{"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
{"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
diff --git a/cfg-common.h b/cfg-common.h
index ea5fce6..6330979 100644
--- a/cfg-common.h
+++ b/cfg-common.h
@@ -6,6 +6,10 @@
#include "m_config.h"
#include "m_option.h"
+#ifdef ARCH_AVR32
+extern int avr32_use_pico;
+#endif
+
extern char *mp_msg_charset;
extern int mp_msg_color;
extern int mp_msg_module;
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 7026ada..40cbc66 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -462,6 +462,10 @@ OBJS-$(HAVE_ARMV6) += arm/simple_idct_armv6.o \
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
arm/float_arm_vfp.o \
+OBJS-$(ARCH_AVR32) += avr32/dsputil_avr32.o \
+ avr32/idct.o avr32/fdct.o \
+ avr32/mc.o avr32/h264idct.o \
+
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
arm/mpegvideo_iwmmxt.o \
@@ -517,7 +521,7 @@ TESTS-$(CONFIG_OLDSCALER) += imgresample-test$(EXESUF)
TESTS-$(ARCH_X86) += x86/cpuid-test$(EXESUF) motion-test$(EXESUF)
CLEANFILES = apiexample$(EXESUF)
-DIRS = alpha arm bfin mlib ppc ps2 sh4 sparc x86
+DIRS = alpha arm avr32 bfin mlib ppc ps2 sh4 sparc x86
include $(SUBDIR)../subdir.mak
diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
new file mode 100644
index 0000000..2bc0d27
--- /dev/null
+++ b/libavcodec/avr32/dsputil_avr32.c
@@ -0,0 +1,2492 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "../dsputil.h"
+#include "pico.h"
+
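+/* Runtime switch for the PICO coprocessor routines; toggled from the
+ * command line with -use-pico / -nouse-pico (see cfg-common-opts.h). */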
+int avr32_use_pico = 1;
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define DSP_FUNC_NAME(name) test_ ## name
+#else
+#define DSP_FUNC_NAME(name) name
+#endif
+
+union doubleword {
+ int64_t doubleword;
+ struct {
+ int32_t top;
+ int32_t bottom;
+ } words;
+};
+
+#undef LD16
+#undef LD32
+#undef LD64
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
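+/* AVR32 is big-endian and (presumably) cannot do an unaligned 64-bit load
+ * in one go, so LD64_UNALIGNED composes the value from two 32-bit loads,
+ * the first word becoming the top half. */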
+#define LD64_UNALIGNED(a) \
+ ({ union doubleword __tmp__; \
+ __tmp__.words.top = LD32(a); \
+ __tmp__.words.bottom = LD32(a + 4); \
+ __tmp__.doubleword; })
+
+#undef ST32
+#undef ST16
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+
+#undef rnd_avg32
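+/* A minimal AVR32 replacement for the generic rnd_avg32(): pavg.ub takes
+ * the rounded per-byte average of the four packed unsigned bytes in each
+ * operand in a single instruction. */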
+#define rnd_avg32(a, b) \
+ ({ uint32_t __tmp__;\
+ __asm__("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
+ __tmp__;})
+
+void idct_avr32(DCTELEM *data);
+void fdct_avr32(DCTELEM *data);
+
+void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+
+void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+
+#define extern_dspfunc(PFX, NUM) \
+ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
+
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+#undef extern_dspfunc
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define extern_dspfunc(PFX, NUM) \
+ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
+
+extern_dspfunc(put, 4);
+extern_dspfunc(put_no_rnd, 4);
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(put, 16);
+extern_dspfunc(put_no_rnd, 16);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+extern_dspfunc(avg, 16);
+extern_dspfunc(avg_no_rnd, 16);
+
+
+#undef extern_dspfunc
+#define extern_dspfunc(PFX, NUM) \
+void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
+
+extern_dspfunc(put_h264_qpel, 16);
+extern_dspfunc(put_h264_qpel, 8);
+extern_dspfunc(put_h264_qpel, 4);
+extern_dspfunc(avg_h264_qpel, 16);
+extern_dspfunc(avg_h264_qpel, 8);
+extern_dspfunc(avg_h264_qpel, 4);
+
+#undef extern_dspfunc
+
+void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+
+void dump_block8(uint8_t *block, int line_size, int h);
+void dump_block4(uint8_t *block, int line_size, int h);
+void dump_block(uint8_t *block, int line_size, int h, int w);
+
+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, char *name, int max_dev);
+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, char *name, int max_dev);
+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, int width, char *name, int max_dev);
+
+#define PIXOP2( OPNAME, OP ) \
+void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ a= LD32(&src1[i*src_stride1+4]);\
+ b= LD32(&src2[i*src_stride2+4]);\
+ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+ }\
+}\
+\
+void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ }\
+}\
+\
+void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
+ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#else
+#define PIXOP2( OPNAME, OP ) \
+static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+ OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
+ OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ a= LD32(&src1[i*src_stride1+4]);\
+ b= LD32(&src2[i*src_stride2+4]);\
+ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+ }\
+}\
+\
+static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ }\
+}\
+\
+static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
+ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#endif
+
+#define op_avg(a, b) a = rnd_avg32(a, b)
+#define op_put(a, b) a = b
+
+PIXOP2(avg, op_avg)
+PIXOP2(put, op_put)
+#undef op_avg
+#undef op_put
+
+
+
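+/* Zero all six 64-coefficient DCT blocks (6*64*2 = 768 bytes) with
+ * multiple-store (stm) instructions: each loop iteration stores 64 bytes,
+ * walking backwards from the end of the buffer (12 * 64 = 768). */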
+static void clear_blocks_avr32(DCTELEM *blocks)
+{
+ int n = 12;
+ uint64_t tmp1, tmp2;
+ blocks += 6*64;
+ __asm__ volatile ( "mov\t%1, 0\n"
+ "mov\t%m1, 0\n"
+ "mov\t%2, 0\n"
+ "mov\t%m2, 0\n"
+ "0:\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "sub\t%0, 1\n"
+ "brne\t0b\n"
+ : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
+ "+r"(blocks));
+}
+
+
+static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
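+ /* Bilinear chroma interpolation, same as the C reference code:
+ * dst[i] = (A*src[i] + B*src[i+1] + C*src[stride+i] + D*src[stride+i+1] + 32) >> 6
+ * The weights A..D are loaded into the PICO coefficient registers;
+ * 32 is the rounding offset and 6 the number of fractional bits. */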
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+
+ int src0 = LD32(src);
+ int src1 = LD32(src + stride);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+ src += stride;
+ ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0));
+ dst += stride;
+ }
+}
+
+
+static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ dst+= stride;
+ src+= stride;
+ */
+
+ int src0 = LD32(src);
+ int src1 = (((int)src[4] << 24) | (int)src[stride]);
+ int src2 = LD32(src + stride + 1);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+ dst += stride;
+ }
+}
+
+static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+ dst+= stride;
+ src+= stride;
+ */
+ int src0 = LD32(src);
+ int src1 = (((int)src[4] << 24) | (int)src[stride]);
+ int src2 = LD32(src + stride + 1);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+ src0 = LD32(src + 4);
+ src1 = (src[8] << 24) | src[stride + 4];
+ src2 = LD32(src + stride + 5);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
+
+ dst += stride;
+ }
+}
+
+
+static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ int src0 = LD32(src);
+ int src1 = LD32(src + stride);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+ src += stride;
+ ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
+ dst += stride;
+ }
+}
+
+
+static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ dst+= stride;
+ src+= stride;
+ */
+
+ int src0 = LD32(src);
+ int src1 = (int)((src[4] << 24) | src[stride]);
+ int src2 = LD32(src + stride + 1);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+ dst += stride;
+ }
+}
+
+static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+ dst+= stride;
+ src+= stride;
+ */
+ int src0 = LD32(src);
+ int src1 = (int)((src[4] << 24) | src[stride]);
+ int src2 = LD32(src + stride + 1);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+
+ src0 = LD32(src + 4);
+ src1 = (int)((src[8] << 24) | src[stride + 4]);
+ src2 = LD32(src + stride + 5);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
+ dst += stride;
+ }
+}
+
+static struct pico_config_t h264_qpel4_h_lowpass_config = {
+ .input_mode = PICO_HOR_FILTER_MODE,
+ .output_mode = PICO_PLANAR_MODE,
+ .coeff_frac_bits = 5,
+ .offset_frac_bits = 5,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 16,
+ .coeff1_0 = 20,
+ .coeff1_1 = -5,
+ .coeff1_2 = 1,
+ .coeff1_3 = 0,
+ .coeff2_0 = 0,
+ .coeff2_1 = 0,
+ .coeff2_2 = 0,
+ .coeff2_3 = 0
+};
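+/* The H.264 half-pel luma kernel (1, -5, 20, 20, -5, 1) split across two
+ * coefficient registers; the fourth entry of coeff0 (16) is the rounding
+ * term for the final (sum + 16) >> 5 with 5 fractional bits. */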
+
+
+
+static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ const int h=4;
+ int i;
+
+ set_pico_config(&h264_qpel4_h_lowpass_config);
+
+ for(i=0; i<h; i++){
+
+ /*
+ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
+ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
+ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
+ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
+ dst+=dstStride;\
+ src+=srcStride;\
+ */
+ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
+ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
+ src += srcStride;
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+ dst += dstStride;
+ }
+}
+
+static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ const int h=4;
+ int i;
+
+ set_pico_config(&h264_qpel4_h_lowpass_config);
+
+ for(i=0; i<h; i++){
+
+ /*
+ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
+ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
+ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
+ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
+ dst+=dstStride;\
+ src+=srcStride;\
+ */
+
+ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
+ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
+ src += srcStride;
+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+ dst += dstStride;
+ }
+}
+
+static struct pico_config_t h264_qpel4_v_lowpass_config1 = {
+ .input_mode = PICO_VERT_FILTER_MODE,
+ .output_mode = PICO_PACKED_MODE,
+ .coeff_frac_bits = 5,
+ .offset_frac_bits = 5,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 16,
+ .coeff1_0 = 1,
+ .coeff1_1 = -5,
+ .coeff1_2 = 20,
+ .coeff1_3 = 16,
+ .coeff2_0 = 1,
+ .coeff2_1 = -5,
+ .coeff2_2 = 20,
+ .coeff2_3 = 16
+};
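+/* Vertical pass with packed output: all three vector units carry the same
+ * (1, -5, 20) half-kernel, so one PICO_OP filters three input rows and a
+ * second PICO_OP with PICO_USE_ACC folds in the remaining three taps. */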
+
+
+
+static struct pico_config_t h264_qpel4_v_lowpass_config2 = {
+ .input_mode = PICO_VERT_FILTER_MODE,
+ .output_mode = PICO_PLANAR_MODE,
+ .coeff_frac_bits = 5,
+ .offset_frac_bits = 5,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 16,
+ .coeff1_0 = 20,
+ .coeff1_1 = -5,
+ .coeff1_2 = 1,
+ .coeff1_3 = 0,
+ .coeff2_0 = 0,
+ .coeff2_1 = 0,
+ .coeff2_2 = 0,
+ .coeff2_3 = 0
+};
+
+static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+
+ /*
+ const int w=4;
+ uint8_t *cm = cropTbl + MAX_NEG_CROP;
+ int i;
+ for(i=0; i<w; i++)
+ {
+ const int srcB= src[-2*srcStride];\
+ const int srcA= src[-1*srcStride];\
+ const int src0= src[0 *srcStride];\
+ const int src1= src[1 *srcStride];\
+ const int src2= src[2 *srcStride];\
+ const int src3= src[3 *srcStride];\
+ const int src4= src[4 *srcStride];\
+ const int src5= src[5 *srcStride];\
+ const int src6= src[6 *srcStride];\
+ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
+ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
+ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
+ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
+ dst++;\
+ src++;\
+ */
+
+ set_pico_config(&h264_qpel4_v_lowpass_config1);
+
+ {
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
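+ /* AVR32 is big-endian, so t, u, l and b pick out the bytes of
+ * .word from most to least significant. */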
+ union wordbytes {
+ int word;
+ struct {
+ unsigned int t:8;
+ unsigned int u:8;
+ unsigned int l:8;
+ unsigned int b:8;
+ } bytes;
+ } tmp1, tmp2, tmp3;
+
+ /* First compute the leftmost three columns */
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+ dst += dstStride;
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_MVRC_W(PICO_INPIX1, src3);
+ PICO_MVRC_W(PICO_INPIX0, src4);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+ dst += dstStride;
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_MVRC_W(PICO_INPIX1, src4);
+ PICO_MVRC_W(PICO_INPIX0, src5);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+ dst += dstStride;
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src4);
+ PICO_MVRC_W(PICO_INPIX1, src5);
+ PICO_MVRC_W(PICO_INPIX0, src6);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+ /* Now compute the last column */
+
+ tmp1.bytes.t = srcB;
+ tmp1.bytes.u = src1;
+ tmp1.bytes.l = src4;
+
+ tmp2.bytes.t = srcA;
+ tmp2.bytes.u = src2;
+ tmp2.bytes.l = src5;
+
+ tmp3.bytes.t = src0;
+ tmp3.bytes.u = src3;
+ tmp3.bytes.l = src6;
+
+ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
+ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
+ PICO_MVRC_W(PICO_INPIX2, tmp3.word);
+ set_pico_config(&h264_qpel4_v_lowpass_config2);
+
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
+
+ PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
+ dst[3] = (char)(tmp1.bytes.b);
+ dst[3 - dstStride] = (char)(tmp1.bytes.l);
+ dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
+ dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
+
+ }
+}
+
+static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+
+ /*
+ const int w=4;
+ uint8_t *cm = cropTbl + MAX_NEG_CROP;
+ int i;
+ for(i=0; i<w; i++)
+ {
+ const int srcB= src[-2*srcStride];\
+ const int srcA= src[-1*srcStride];\
+ const int src0= src[0 *srcStride];\
+ const int src1= src[1 *srcStride];\
+ const int src2= src[2 *srcStride];\
+ const int src3= src[3 *srcStride];\
+ const int src4= src[4 *srcStride];\
+ const int src5= src[5 *srcStride];\
+ const int src6= src[6 *srcStride];\
+ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
+ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
+ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
+ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
+ dst++;\
+ src++;\
+ */
+ uint8_t tmp_block[4*4];
+
+ set_pico_config(&h264_qpel4_v_lowpass_config1);
+
+ {
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
+ union wordbytes {
+ int word;
+ struct {
+ unsigned int t:8;
+ unsigned int u:8;
+ unsigned int l:8;
+ unsigned int b:8;
+ } bytes;
+ } tmp1, tmp2, tmp3;
+
+ /* First compute the leftmost three columns */
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0));
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_MVRC_W(PICO_INPIX1, src3);
+ PICO_MVRC_W(PICO_INPIX0, src4);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0));
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_MVRC_W(PICO_INPIX1, src4);
+ PICO_MVRC_W(PICO_INPIX0, src5);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0));
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(0, 0, 0, 3, 6);
+ PICO_MVRC_W(PICO_INPIX2, src4);
+ PICO_MVRC_W(PICO_INPIX1, src5);
+ PICO_MVRC_W(PICO_INPIX0, src6);
+ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
+ ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0));
+ /* Now compute the last column */
+
+ tmp1.bytes.t = srcB;
+ tmp1.bytes.u = src1;
+ tmp1.bytes.l = src4;
+
+ tmp2.bytes.t = srcA;
+ tmp2.bytes.u = src2;
+ tmp2.bytes.l = src5;
+
+ tmp3.bytes.t = src0;
+ tmp3.bytes.u = src3;
+ tmp3.bytes.l = src6;
+
+ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
+ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
+ PICO_MVRC_W(PICO_INPIX2, tmp3.word);
+ set_pico_config(&h264_qpel4_v_lowpass_config2);
+
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
+
+ PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
+ tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
+ tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
+ tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
+ tmp_block[3] = (char)(tmp1.bytes.t);
+
+ /* Compute the average */
+ srcB= LD32(dst);
+ srcA= LD32(dst + dstStride);
+ src0= LD32(dst + dstStride*2);
+ src1= LD32(dst + dstStride*3);
+
+ src2= LD32(tmp_block);
+ src3= LD32(tmp_block + 4);
+ src4= LD32(tmp_block + 8);
+ src5= LD32(tmp_block + 12);
+
+ ST32(dst, rnd_avg32(srcB, src2));
+ ST32(dst + dstStride, rnd_avg32(srcA, src3));
+ ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
+ ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
+ }
+}
+
+static struct pico_config_t h264_qpel4_hv_lowpass_config = {
+ .input_mode = PICO_HOR_FILTER_MODE,
+ .output_mode = PICO_PACKED_MODE,
+ .coeff_frac_bits = 10,
+ .offset_frac_bits = 10,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 512,
+ .coeff1_0 = -5,
+ .coeff1_1 = 25,
+ .coeff1_2 = -100,
+ .coeff1_3 = 0,
+ .coeff2_0 = 20,
+ .coeff2_1 = -100,
+ .coeff2_2 = 400,
+ .coeff2_3 = 0
+};
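+/* Coefficients for the combined 2D (hv) filter: the outer product of the
+ * half-kernel (1, -5, 20) with itself, i.e. rows (1,-5,20), (-5,25,-100)
+ * and (20,-100,400), with 512 as the rounding term for the final >> 10.
+ * The horizontal pass below keeps full-precision intermediates in
+ * tmp_block; the vertical pass reloads them as accumulators. */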
+
+static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+
+ int32_t tmp_block[48];
+ int32_t *tmp = tmp_block;
+ int i;
+
+ set_pico_config(&h264_qpel4_hv_lowpass_config);
+
+ src -= 2;
+ for ( i = 0; i < 2; i++ ){
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(0, 0, 0, 4, 8);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_OP(0, 0, 4, 8, 0);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_MVRC_W(PICO_INPIX1, src3);
+ PICO_MVRC_W(PICO_INPIX0, src4);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_MVRC_W(PICO_INPIX0, src2);
+ PICO_OP(0, 0, 4, 8, 0);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_MVRC_W(PICO_INPIX1, src4);
+ PICO_MVRC_W(PICO_INPIX0, src5);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(0, 0, 4, 8, 0);
+ PICO_MVRC_W(PICO_INPIX2, src4);
+ PICO_MVRC_W(PICO_INPIX1, src5);
+ PICO_MVRC_W(PICO_INPIX0, src6);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+ src += 2;
+ }
+
+ src -= 1;
+ tmp -= 48;
+
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(10)
+ | PICO_OFFSET_FRAC_BITS(10));
+
+ for ( i = 0; i < 2; i++ ){
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_MVRC_W(PICO_INPIX1, src3);
+ PICO_MVRC_W(PICO_INPIX0, src4);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
+
+ ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
+ ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_MVRC_W(PICO_INPIX1, src4);
+ PICO_MVRC_W(PICO_INPIX0, src5);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src4);
+ PICO_MVRC_W(PICO_INPIX1, src5);
+ PICO_MVRC_W(PICO_INPIX0, src6);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
+
+ ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
+ ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
+
+ dst += 2;
+ src += 2;
+ }
+}
+
+
+
+
+static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+
+ int32_t tmp_block[48];
+ int32_t *tmp = tmp_block;
+ int i;
+
+ set_pico_config(&h264_qpel4_hv_lowpass_config);
+
+ src -= 2;
+ for ( i = 0; i < 2; i++ ){
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(0, 0, 0, 4, 8);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_OP(0, 0, 4, 8, 0);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_MVRC_W(PICO_INPIX1, src3);
+ PICO_MVRC_W(PICO_INPIX0, src4);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_MVRC_W(PICO_INPIX0, src2);
+ PICO_OP(0, 0, 4, 8, 0);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_MVRC_W(PICO_INPIX1, src4);
+ PICO_MVRC_W(PICO_INPIX0, src5);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(0, 0, 4, 8, 0);
+ PICO_MVRC_W(PICO_INPIX2, src4);
+ PICO_MVRC_W(PICO_INPIX1, src5);
+ PICO_MVRC_W(PICO_INPIX0, src6);
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+
+ PICO_OP(0, 0, 1, 5, 9);
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
+ PICO_STCM_W(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ tmp += 3;
+ src += 2;
+ }
+
+ src -= 1;
+ tmp -= 48;
+
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(10)
+ | PICO_OFFSET_FRAC_BITS(10));
+
+ for ( i = 0; i < 2; i++ ){
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX0, src3);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, srcB);
+ PICO_MVRC_W(PICO_INPIX1, srcA);
+ PICO_MVRC_W(PICO_INPIX2, src0);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_MVRC_W(PICO_INPIX1, src3);
+ PICO_MVRC_W(PICO_INPIX0, src4);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, srcA);
+ PICO_MVRC_W(PICO_INPIX1, src0);
+ PICO_MVRC_W(PICO_INPIX2, src1);
+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
+
+ ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
+ ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0)));
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_MVRC_W(PICO_INPIX1, src4);
+ PICO_MVRC_W(PICO_INPIX0, src5);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
+ PICO_MVRC_W(PICO_INPIX2, src4);
+ PICO_MVRC_W(PICO_INPIX1, src5);
+ PICO_MVRC_W(PICO_INPIX0, src6);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
+
+ PICO_LDCM_W_INC(tmp,
+ PICO_REGVECT_VMU0_OUT,
+ PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT);
+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
+ PICO_MVRC_W(PICO_INPIX0, src1);
+ PICO_MVRC_W(PICO_INPIX1, src2);
+ PICO_MVRC_W(PICO_INPIX2, src3);
+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
+
+ ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
+ ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0)));
+
+ dst += 2;
+ src += 2;
+ }
+}
+
+
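+/* The 8- and 16-pixel wide qpel variants are tiled from the 4-pixel PICO
+ * kernels above, one call per quadrant of the block. */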
+static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+ src += 4*srcStride;
+ dst += 4*dstStride;
+ put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+}
+
+static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+ src += 4*srcStride;
+ dst += 4*dstStride;
+ avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+}
+
+static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+ src += 4*srcStride;
+ dst += 4*dstStride;
+ put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+}
+
+static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+ src += 4*srcStride;
+ dst += 4*dstStride;
+ avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+}
+
+static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+ src += 4*srcStride;
+ dst += 4*dstStride;
+ put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+}
+
+static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+ src += 4*srcStride;
+ dst += 4*dstStride;
+ avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
+ avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
+}
+
+
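+/* H264_MC expands to the 16 quarter-pel entry points (mc00 .. mc33) for a
+ * given block size, combining the h/v/hv lowpass kernels with pixel copies
+ * and averages in the same way as the generic libavcodec macro. */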
+#define H264_MC(OPNAME, SIZE) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t half[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t half[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t half[SIZE*SIZE];\
+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
+ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t half[SIZE*SIZE];\
+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t halfH[SIZE*SIZE];\
+ uint8_t halfV[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t halfH[SIZE*SIZE];\
+ uint8_t halfV[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t halfH[SIZE*SIZE];\
+ uint8_t halfV[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t halfH[SIZE*SIZE];\
+ uint8_t halfV[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t halfH[SIZE*SIZE];\
+ uint8_t halfHV[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t halfH[SIZE*SIZE];\
+ uint8_t halfHV[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t halfV[SIZE*SIZE];\
+ uint8_t halfHV[SIZE*SIZE];\
+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ uint8_t halfV[SIZE*SIZE];\
+ uint8_t halfHV[SIZE*SIZE];\
+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
+}\
+
+H264_MC(put_, 4)
+H264_MC(put_, 8)
+H264_MC(put_, 16)
+H264_MC(avg_, 4)
+H264_MC(avg_, 8)
+H264_MC(avg_, 16)
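+
+/* For reference, a sketch of what one instantiation above expands to; the
+ * mc20 case of H264_MC(put_, 8) becomes (illustrative, names taken from the
+ * macro body):
+ *
+ * static void put_h264_qpel8_mc20_pico(uint8_t *dst, uint8_t *src, int stride){
+ * put_h264_qpel8_h_lowpass_pico(dst, src, stride, stride);
+ * }
+ */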
+
+
+#define dspfunc16(PFX) \
+ void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
+ PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
+ PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
+ }\
+ void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
+ PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
+ PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
+ }\
+ void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
+ PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
+ PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
+ }\
+ void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
+ PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
+ PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
+ }\
+
+
+dspfunc16(put)
+dspfunc16(put_no_rnd)
+dspfunc16(avg)
+dspfunc16(avg_no_rnd)
+#undef dspfunc16
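+
+/* Each 16-wide function above is simply two 8-wide calls on the left and
+ * right halves of the block; e.g. dspfunc16(put) defines (expansion sketch):
+ *
+ * void put_pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h){
+ * put_pixels8_avr32(dst, pixels, line_size, h);
+ * put_pixels8_avr32(dst + 8, pixels + 8, line_size, h);
+ * }
+ */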
+
+
+/* Returns a nonzero value in each byte where the absolute difference
+ between <a> and <b> is less than <compare>, and zero elsewhere */
+#define PABS_DIFF_LESS_THAN( a, b, compare) \
+ ({ uint32_t __tmp__, __tmp2__, __mask__; \
+ __asm__ ( \
+ /* Check ABS( a - b ) < compare */ \
+ "psubs.ub\t%[tmp], %[opa], %[opb]\n" \
+ "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
+ "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \
+ /* This produces 0 for all bytes where the comparison is not true */ \
+ "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
+ : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \
+ : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
+ __mask__; })
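+
+/* Per byte lane this is equivalent to (scalar sketch; sat_sub_u8 is an
+ * assumed unsigned saturating subtract, shown only for illustration):
+ *
+ * abs = sat_sub_u8(a, b) | sat_sub_u8(b, a); // == ABS(a - b)
+ * mask = sat_sub_u8(compare, abs); // nonzero iff abs < compare
+ */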
+
+/*
+ Set all bytes containing zero in <value> to 255 and the rest to zero.
+
+ Adding 254 with saturation turns every nonzero byte into 255; the
+ following non-saturating add of one then wraps those bytes to zero,
+ while the bytes that were originally zero become 255. */
+#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
+ ({ uint32_t __tmp__; \
+ __asm__ ( \
+ "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \
+ "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
+ : [tmp] "=r"(__tmp__) \
+ : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
+ __tmp__; })
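+
+/* Worked example: value = 0x00FF0001 -> after padds.ub with 0xFEFEFEFE the
+ register holds 0xFEFFFEFF; adding 0x01010101 without saturation then
+ wraps it to 0xFF00FF00. */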
+
+#define PACKW_SH(upper, lower) \
+ ({ uint32_t __tmp__; \
+ __asm__ ( \
+ "packw.sh\t%[tmp], %[u], %[l]\n" \
+ : [tmp] "=r"(__tmp__) \
+ : [u] "r"(upper), [l] "r"(lower) ); \
+ __tmp__; })
+
+#define PACKSH_UB(upper, lower) \
+ ({ uint32_t __tmp__; \
+ __asm__ ( \
+ "packsh.sb\t%[tmp], %[u], %[l]\n" \
+ : [tmp] "=r"(__tmp__) \
+ : [u] "r"(upper), [l] "r"(lower) ); \
+ __tmp__; })
+
+static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ int i;
+
+ if ( alpha == 0 )
+ return;
+
+ alpha = PACKW_SH(alpha, alpha);
+ alpha = PACKSH_UB(alpha, alpha);
+ beta = PACKW_SH(beta, beta);
+ beta = PACKSH_UB(beta, beta);
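+ /* alpha and beta now hold their 8-bit threshold replicated into every
+ byte lane, ready for the packed byte compares below. */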
+
+ for( i = 0; i < 4; i++ ) {
+ int tc, tc0_p, tc0_m;
+ uint32_t p0, p1, p2, q0, q1, q2;
+ uint32_t old_p0, old_q0;
+ uint32_t mask, mask2;
+ uint32_t tmp, tmp2, tmp3, tmp4;
+
+ if( tc0[i] < 0 ) {
+ pix += 4;
+ continue;
+ }
+
+ p0 = LD32(pix - stride);
+ p1 = LD32(pix - 2*stride);
+ q0 = LD32(pix);
+ q1 = LD32(pix + stride);
+
+ /* Check which of the columns should be filtered, if any. */
+ mask = PABS_DIFF_LESS_THAN(p0, q0, alpha);
+ mask |= PABS_DIFF_LESS_THAN(p1, p0, beta);
+ mask |= PABS_DIFF_LESS_THAN(q1, q0, beta);
+
+ if ( !mask ){
+ pix += 4;
+ continue;
+ }
+
+ mask = SET_ALL_BITS_IN_ZERO_BYTES(mask);
+
+ tc = PACKW_SH(tc0[i], tc0[i]);
+ tc0_p = tc;
+ tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
+
+ /*
+ int i_delta;
+ if( ABS( p2 - p0 ) < beta ) {
+ pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
+ tc++;
+ }
+ */
+
+ p2 = LD32(pix - 3*stride);
+ mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
+
+ if ( mask2 ){
+ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
+ __asm__ ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
+ "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
+ "punpckub.h\t%[tmp2], %[tmp]:t\n"
+ "punpckub.h\t%[tmp], %[tmp]:b\n"
+ "punpckub.h\t%[tmp3], %[p1]:t\n"
+ "punpckub.h\t%[tmp4], %[p1]:b\n"
+ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
+ "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
+ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
+ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
+ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
+ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
+ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
+ "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
+ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
+ "andn\t%[tmp], %[mask2]\n"
+ "and\t%[tmp2], %[q1], %[mask2]\n"
+ "or\t%[tmp], %[tmp2]\n"
+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
+ [tmp4]"=&r"(tmp4)
+ : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
+ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
+ ST32(pix - 2*stride, tmp);
+ tc += 0x00010001;
+ }
+
+ q2 = LD32(pix + 2*stride);
+
+ /*
+ if( ABS( q2 - q0 ) < beta ) {
+ pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
+ tc++;
+ }
+ */
+ mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
+
+ if ( mask2 ){
+ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
+ __asm__ ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
+ "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
+ "punpckub.h\t%[tmp2], %[tmp]:t\n"
+ "punpckub.h\t%[tmp], %[tmp]:b\n"
+ "punpckub.h\t%[tmp3], %[q1]:t\n"
+ "punpckub.h\t%[tmp4], %[q1]:b\n"
+ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
+ "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
+ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
+ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
+ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
+ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
+ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
+ "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
+ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
+ "andn\t%[tmp], %[mask2]\n"
+ "and\t%[tmp2], %[q1], %[mask2]\n"
+ "or\t%[tmp], %[tmp2]\n"
+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
+ [tmp4]"=&r"(tmp4)
+ : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
+ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
+ ST32(pix + stride, tmp);
+ tc += 0x00010001;
+ }
+
+ old_p0 = p0;
+ old_q0 = q0;
+
+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-stride] = clip_uint8( p0 + i_delta );
+ pix[0] = clip_uint8( q0 - i_delta );
+ */
+
+ __asm__ (
+ /* Check if the two upper pixels should be filtered */
+ "lsr\t%[tmp], %[inv_mask], 16\n"
+ "breq\t0f\n"
+
+ "punpckub.h\t%[tmp], %[p1]:t\n"
+ "punpckub.h\t%[tmp2], %[q1]:t\n"
+
+ /* p1 - q1 */
+ "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
+
+ "punpckub.h\t%[tmp3], %[q0]:t\n"
+ "punpckub.h\t%[tmp4], %[p0]:t\n"
+
+ /* q0 - p0 */
+ "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
+
+ /* (q0 - p0) << 2 */
+ "plsl.h\t%[tmp2], %[tmp2], 2\n"
+
+ /* ((q0 - p0) << 2) + (p1 - q1) */
+ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
+
+ "mov\t%[tmp], 0x00040004\n"
+ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
+ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
+
+ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
+ "pasr.h\t%[tmp2], %[tmp2], 3\n"
+
+ "mov\t%[tmp], 0\n"
+ "psub.h\t%[tmp], %[tmp], %[tc]\n"
+
+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
+ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
+ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
+
+ /* pix[-stride] = clip_uint8( p0 + i_delta ); */
+ "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
+
+ /* pix[0] = clip_uint8( q0 - i_delta ); */
+ "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
+
+ /* Check if the two lower pixels should be filtered */
+ "lsl\t%[tmp2], %[inv_mask], 16\n"
+ "breq\t1f\n"
+
+ "0:\n"
+ "punpckub.h\t%[p1], %[p1]:b\n"
+ "punpckub.h\t%[q1], %[q1]:b\n"
+
+ /* p1 - q1 */
+ "psub.h\t%[p1], %[p1], %[q1]\n"
+
+ "punpckub.h\t%[q0], %[q0]:b\n"
+ "punpckub.h\t%[p0], %[p0]:b\n"
+
+ /* q0 - p0 */
+ "psub.h\t%[tmp2], %[q0], %[p0]\n"
+
+ /* (q0 - p0) << 2 */
+ "plsl.h\t%[tmp2], %[tmp2], 2\n"
+
+ /* ((q0 - p0) << 2) + (p1 - q1) */
+ "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
+
+ "mov\t%[q1], 0x00040004\n"
+ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
+ "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
+
+ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
+ "pasr.h\t%[tmp2], %[tmp2], 3\n"
+
+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
+ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
+ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
+
+ /* pix[-stride] = clip_uint8( p0 + i_delta ); */
+ "padd.h\t%[p0], %[p0], %[tmp2]\n"
+
+ /* pix[0] = clip_uint8( q0 - i_delta ); */
+ "psub.h\t%[q0], %[q0], %[tmp2]\n"
+
+ "1:\n"
+ "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
+ "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
+
+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
+ [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
+ : [tc]"r"(tc), [inv_mask]"r"(~mask));
+
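+ /* Merge: lanes where mask is all-ones keep the original pixels, the
+ remaining lanes take the filtered values. */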
+ ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
+ ST32(pix, (mask & old_q0) | (q0 & ~mask));
+
+ pix += 4;
+ }
+}
+
+
+
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+
+void dump_block8(uint8_t *block, int line_size, int h){
+ int i, j;
+
+ for ( i = 0; i < h ; i++ ){
+ av_log(NULL, AV_LOG_ERROR, "\t");
+ for ( j = 0; j < 8 ; j++ ){
+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
+ }
+ av_log(NULL, AV_LOG_ERROR, "\n");
+ }
+}
+
+void dump_block4(uint8_t *block, int line_size, int h){
+ int i, j;
+
+ for ( i = 0; i < h ; i++ ){
+ av_log(NULL, AV_LOG_ERROR, "\t");
+ for ( j = 0; j < 4 ; j++ ){
+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
+ }
+ av_log(NULL, AV_LOG_ERROR, "\n");
+ }
+}
+
+void dump_block(uint8_t *block, int line_size, int h, int w){
+ int i, j;
+
+ for ( i = 0; i < h ; i++ ){
+ av_log(NULL, AV_LOG_ERROR, "\t");
+ for ( j = 0; j < w ; j++ ){
+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
+ }
+ av_log(NULL, AV_LOG_ERROR, "\n");
+ }
+}
+
+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, char *name, int max_dev){
+ int i,j;
+ for ( i = 0; i < 8 ; i++ ){
+ for ( j = 0; j < h ; j++ ){
+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
+ diff = diff < 0 ? -diff : diff;
+ if ( diff > max_dev ){
+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
+ dump_block8(test, line_size_test, h);
+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
+ dump_block8(correct, line_size_correct, h);
+ exit(1);
+ }
+ }
+ }
+}
+
+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, char *name, int max_dev){
+ int i,j;
+ for ( i = 0; i < 4 ; i++ ){
+ for ( j = 0; j < h ; j++ ){
+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
+ diff = diff < 0 ? -diff : diff;
+ if ( diff > max_dev ){
+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
+ dump_block4(test, line_size_test, h);
+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
+ dump_block4(correct, line_size_correct, h);
+ exit(1);
+ }
+ }
+ }
+}
+
+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, int width, char *name, int max_dev){
+ int i,j;
+ for ( i = 0; i < width ; i++ ){
+ for ( j = 0; j < h ; j++ ){
+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
+ diff = diff < 0 ? -diff : diff;
+ if ( diff > max_dev ){
+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
+ dump_block(test, line_size_test, h, width);
+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
+ dump_block(correct, line_size_correct, h, width);
+ exit(1);
+ }
+ }
+ }
+}
+
+void dump_dct_block(DCTELEM *block){
+ int i, j;
+
+ for ( i = 0; i < 8 ; i++ ){
+ av_log(NULL, AV_LOG_ERROR, "\t");
+ for ( j = 0; j < 8 ; j++ ){
+ av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]);
+ }
+ av_log(NULL, AV_LOG_ERROR, "\n");
+ }
+}
+
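+/* The AVR32 idct consumes coefficients in transposed order (the init code
+ below selects FF_TRANSPOSE_IDCT_PERM), so the reference implementation is
+ fed a transposed copy of the block before the outputs are compared. */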
+void test_idct_avr32(DCTELEM *block){
+ DCTELEM testBlock[64];
+ int i, j;
+
+ /* Copy transposed block to testBlock */
+ for ( i = 0; i < 8 ; i++ ){
+ for ( j = 0; j < 8 ; j++ ){
+ testBlock[i + 8*j] = block[j + i*8];
+ }
+ }
+
+ idct_avr32(block);
+ simple_idct(&testBlock);
+
+ for ( i = 0; i < 64 ; i++ ){
+ if ( block[i] != testBlock[i] ){
+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
+ dump_dct_block(block);
+ av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n");
+ dump_dct_block(testBlock);
+ exit(1);
+ }
+ }
+}
+
+void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
+ uint8_t testBlock[64];
+ DCTELEM blockCopy[64];
+ int i, j;
+
+ /* Copy transposed block to blockCopy */
+ for ( i = 0; i < 8 ; i++ ){
+ for ( j = 0; j < 8 ; j++ ){
+ blockCopy[i + 8*j] = block[j + i*8];
+ }
+ }
+
+ idct_put_avr32(dest, line_size, block);
+ simple_idct_put(&testBlock, 8, blockCopy);
+
+ check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
+}
+
+
+void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
+ uint8_t testBlock[64];
+ DCTELEM blockCopy[64];
+ int i, j;
+
+ /* Copy dest to testBlock */
+ for ( i = 0; i < 8 ; i++ ){
+ for ( j = 0; j < 8 ; j++ ){
+ testBlock[i + 8*j] = dest[i + j*line_size];
+ }
+ }
+
+ /* Copy transposed block to blockCopy */
+ for ( i = 0; i < 8 ; i++ ){
+ for ( j = 0; j < 8 ; j++ ){
+ blockCopy[i + 8*j] = block[j + i*8];
+ }
+ }
+
+ idct_add_avr32(dest, line_size, block);
+ simple_idct_add(&testBlock, 8, blockCopy);
+
+ check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
+}
+
+void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
+ uint8_t testBlock[16];
+ DCTELEM blockCopy[16];
+ int i, j;
+
+ /* Copy dest to testBlock */
+ for ( i = 0; i < 4 ; i++ ){
+ for ( j = 0; j < 4 ; j++ ){
+ testBlock[i + 4*j] = dest[i + j*stride];
+ }
+ }
+
+ /* Copy the block to blockCopy */
+ for ( i = 0; i < 16 ; i++ ){
+ blockCopy[i] = block[i];
+ }
+
+ ff_h264_idct_add_c(dest, block, stride);
+
+ h264_idct_add_avr32(testBlock, blockCopy, 4);
+
+ check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0);
+}
+
+void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
+ uint8_t testBlock[8*8];
+ DCTELEM blockCopy[8*8];
+ int i, j;
+
+ /* Copy dest to testBlock */
+ for ( i = 0; i < 8 ; i++ ){
+ for ( j = 0; j < 8 ; j++ ){
+ testBlock[i + 8*j] = dest[i + j*stride];
+ }
+ }
+
+ /* Copy source block to blockCopy */
+ for ( i = 0; i < 8*8 ; i++ ){
+ blockCopy[i] = block[i];
+ }
+
+ ff_h264_idct8_add_c(dest, block, stride);
+ h264_idct8_add_avr32(testBlock, blockCopy, 8);
+
+ check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0);
+}
+
+void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
+ const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
+ uint8_t *testBlock, *testBlock2;
+ int i, j;
+ int input_v_size = h + in_v_size;
+ int input_h_size = 8 + in_h_size;
+
+ testBlock = alloca(input_h_size*input_v_size);
+ testBlock2 = alloca(input_h_size*input_v_size);
+
+ for ( i = 0; i < input_h_size ; i++ ){
+ for ( j = 0; j < input_v_size ; j++ ){
+ testBlock[i + input_h_size*j] = pixels[i + j*line_size];
+ }
+ }
+
+ test(block, pixels, line_size, h);
+ correct(testBlock2, testBlock, input_h_size, h);
+
+ check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
+
+}
+
+void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
+ uint8_t *src, int stride, int h, int w, int x, int y, char *name){
+ uint8_t *testBlock, *testBlock2;
+ int i, j;
+ int input_v_size = h + 1;
+ int input_h_size = ((w + 1) + 3) & ~3;
+
+ testBlock = alloca(input_h_size*input_v_size);
+ testBlock2 = alloca(input_h_size*input_v_size);
+
+ for ( i = 0; i < w + 1 ; i++ ){
+ for ( j = 0; j < h + 1 ; j++ ){
+ testBlock[i + input_h_size*j] = src[i + j*stride];
+ }
+ }
+
+ for ( i = 0; i < w ; i++ ){
+ for ( j = 0; j < h ; j++ ){
+ testBlock2[i + input_h_size*j] = dst[i + j*stride];
+ }
+ }
+
+ test(dst, src, stride, h, x, y);
+ correct(testBlock2, testBlock, input_h_size, h, x, y);
+
+ check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0);
+
+}
+
+void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
+ uint8_t *src, int stride, int size, char *name){
+ uint8_t *testBlock, *testBlock2;
+ int i, j;
+ int test_stride = size + 8;
+
+ testBlock = (uint8_t *)alloca(test_stride*(size+8)) + 4 + test_stride*4;
+ testBlock2 = alloca(test_stride*size);
+
+ for ( i = -4; i < size+4 ; i++ ){
+ for ( j = -4; j < size+4 ; j++ ){
+ testBlock[i + test_stride*j] = src[i + j*stride];
+ }
+ }
+
+ for ( i = 0; i < size ; i++ ){
+ for ( j = 0; j < size ; j++ ){
+ testBlock2[i + test_stride*j] = dst[i + j*stride];
+ }
+ }
+
+ correct(dst, src, stride);
+ test(testBlock2, testBlock, test_stride);
+
+ check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
+
+}
+
+
+#define test_pixels_funcs(PFX, NUM ) \
+void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
+ block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
+void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
+ block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
+void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
+ block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
+void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
+ block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
+
+test_pixels_funcs(put, 8);
+test_pixels_funcs(put_no_rnd, 8);
+test_pixels_funcs(put, 16);
+test_pixels_funcs(put_no_rnd, 16);
+
+test_pixels_funcs(avg, 8);
+test_pixels_funcs(avg_no_rnd, 8);
+test_pixels_funcs(avg, 16);
+test_pixels_funcs(avg_no_rnd, 16);
+
+#define test_h264_chroma_mc_funcs(PFX, NUM ) \
+void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
+ test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
+ dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
+
+test_h264_chroma_mc_funcs(put, 2);
+test_h264_chroma_mc_funcs(put, 4);
+test_h264_chroma_mc_funcs(put, 8);
+test_h264_chroma_mc_funcs(avg, 2);
+test_h264_chroma_mc_funcs(avg, 4);
+test_h264_chroma_mc_funcs(avg, 8);
+
+#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
+void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
+ test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
+ dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
+
+#define test_qpel_mc_funcs(PFX, NUM) \
+ test_qpel_mc_funcs_type(PFX, NUM, mc00);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc10);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc20);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc30);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc01);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc11);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc21);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc31);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc02);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc12);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc22);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc32);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc03);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc13);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc23);\
+ test_qpel_mc_funcs_type(PFX, NUM, mc33)
+
+test_qpel_mc_funcs(put_h264_qpel, 4);
+test_qpel_mc_funcs(put_h264_qpel, 8);
+test_qpel_mc_funcs(put_h264_qpel, 16);
+test_qpel_mc_funcs(avg_h264_qpel, 4);
+test_qpel_mc_funcs(avg_h264_qpel, 8);
+test_qpel_mc_funcs(avg_h264_qpel, 16);
+
+
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
+ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
+ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
+ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
+ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
+ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
+ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
+
+#endif
+
+void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
+{
+
+ /* H264 */
+
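+ /* The pico chroma MC routines are deliberately disabled here. */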
+ if ( 0 /*avr32_use_pico*/ ){
+ c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
+ c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
+ c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
+
+ c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
+ c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
+ c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
+ }
+
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
+ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
+ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
+ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
+ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
+ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
+ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
+ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
+
+ if ( avr32_use_pico ){
+ dspfunc(put_h264_qpel, 0, 16);
+ dspfunc(put_h264_qpel, 1, 8);
+ dspfunc(put_h264_qpel, 2, 4);
+ dspfunc(avg_h264_qpel, 0, 16);
+ dspfunc(avg_h264_qpel, 1, 8);
+ dspfunc(avg_h264_qpel, 2, 4);
+ }
+
+ c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
+ c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
+ c->idct = DSP_FUNC_NAME(idct_avr32);
+ c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
+ c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
+
+ /*
+ c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;
+ */
+
+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
+
+ c->fdct = fdct_avr32;
+
+ c->clear_blocks = clear_blocks_avr32;
+
+#undef dspfunc
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
+ c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
+ c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
+ c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
+
+ dspfunc(put, 0, 16);
+ dspfunc(put_no_rnd, 0, 16);
+ dspfunc(put, 1, 8);
+ dspfunc(put_no_rnd, 1, 8);
+
+ dspfunc(avg, 1, 8);
+ dspfunc(avg_no_rnd, 1, 8);
+ dspfunc(avg, 0, 16);
+ dspfunc(avg_no_rnd, 0, 16);
+#undef dspfunc
+
+}
+
diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S
new file mode 100644
index 0000000..ceede08
--- /dev/null
+++ b/libavcodec/avr32/fdct.S
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+//**********************************************************
+//* 2-D fDCT, Based on: *
+//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
+//* Fast 1-D DCT Algorithms with 11 Multiplications", *
+//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal *
+//* Processing 1989 (ICASSP '89), pp. 988-991. *
+//* *
+//* Fixed point implementation optimized for the AVR32 *
+//* instruction set; pairs of 16-bit coefficients are *
+//* loaded from the constant tables with single word *
+//* accesses where possible. *
+//**********************************************************
+
+
+/* This routine is a slow-but-accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). Taken from the IJG software
+ *
+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+ * on each column. Direct algorithms are also available, but they are
+ * much more complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on an algorithm described in
+ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+ * The primary algorithm described there uses 11 multiplies and 29 adds.
+ * We use their alternate method with 12 multiplies and 32 adds.
+ * The advantage of this method is that no data path contains more than one
+ * multiplication; this allows a very simple and accurate implementation in
+ * scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * The poop on this scaling stuff is as follows:
+ *
+ * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+ * larger than the true DCT outputs. The final outputs are therefore
+ * a factor of N larger than desired; since N=8 this can be cured by
+ * a simple right shift at the end of the algorithm. The advantage of
+ * this arrangement is that we save two multiplications per 1-D DCT,
+ * because the y0 and y4 outputs need not be divided by sqrt(N).
+ * In the IJG code, this factor of 8 is removed by the quantization step
+ * (in jcdctmgr.c); here it is folded into the final descaling shifts.
+ *
+ * We have to do addition and subtraction of the integer inputs, which
+ * is no problem, and multiplication by fractional constants, which is
+ * a problem to do in integer arithmetic. We multiply all the constants
+ * by CONST_SCALE and convert them to integer constants (thus retaining
+ * CONST_BITS bits of precision in the constants). After doing a
+ * multiplication we have to divide the product by CONST_SCALE, with proper
+ * rounding, to produce the correct output. This division can be done
+ * cheaply as a right shift of CONST_BITS bits. We postpone shifting
+ * as long as possible so that partial sums can be added together with
+ * full fractional precision.
+ *
+ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
+ * they are represented to better-than-integral precision. These outputs
+ * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
+ * with the recommended scaling. (For 12-bit sample data, the intermediate
+ * array is INT32 anyway.)
+ *
+ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
+ * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
+ * shows that the values given below are the most effective.
+ *
+ * We can gain a little more speed, with a further compromise in accuracy,
+ * by omitting the addition in a descaling shift. This yields an incorrectly
+ * rounded result half the time...
+ */
+
+ .global fdct_avr32
+
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define FIX_0_298631336 2446 /* FIX(0.298631336) */
+#define FIX_0_390180644 3196 /* FIX(0.390180644) */
+#define FIX_0_541196100 4433 /* FIX(0.541196100) */
+#define FIX_0_765366865 6270 /* FIX(0.765366865) */
+#define FIX_0_899976223 7373 /* FIX(0.899976223) */
+#define FIX_1_175875602 9633 /* FIX(1.175875602) */
+#define FIX_1_501321110 12299 /* FIX(1.501321110) */
+#define FIX_1_847759065 15137 /* FIX(1.847759065) */
+#define FIX_1_961570560 16069 /* FIX(1.961570560) */
+#define FIX_2_053119869 16819 /* FIX(2.053119869) */
+#define FIX_2_562915447 20995 /* FIX(2.562915447) */
+#define FIX_3_072711026 25172 /* FIX(3.072711026) */
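+
+/* FIX(x) denotes round(x * (1 << CONST_BITS)); for example
+ 0.541196100 * 8192 = 4433.48 (approximately), giving 4433 above. */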
+
+
+/*
+ * Perform an integer forward DCT on one block of samples.
+ */
+
+//void
+//fdct_int32(short *const block)
+//{
+// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+// int tmp10, tmp11, tmp12, tmp13;
+// int z1, z2, z3, z4, z5;
+// short *blkptr;
+// int *dataptr;
+// int data[64];
+// int i;
+//
+// /* Pass 1: process rows. */
+// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+// /* furthermore, we scale the results by 2**PASS1_BITS. */
+//
+// dataptr = data;
+// blkptr = block;
+
+ .text
+fdct_avr32:
+ pushm r0-r3, r4-r7, lr
+#define loop_ctr r0
+#define blkptr r12
+#define x0 r1
+#define x1 r2
+#define x2 r3
+#define x3 r4
+#define x4 r5
+#define x5 r6
+#define x6 r7
+#define x7 r8
+#define tmp0 r5
+#define tmp7 r2
+#define tmp1 r3
+#define tmp6 r4
+#define tmp2 r9
+#define tmp5 r8
+#define tmp3 r7
+#define tmp4 r6
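+
+/* Note: several of the symbolic names above alias the same physical
+ register; the row loop below mostly uses raw register numbers, with the
+ original C variable names kept in the comments. */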
+
+
+ mov loop_ctr, 8
+// for (i = 0; i < 8; i++) {
+ROW_LOOP:
+
+ ldm blkptr, r1, r2, r3, r4
+
+// tmp2 = blkptr[2] + blkptr[5];
+// tmp3 = blkptr[3] + blkptr[4];
+ paddx.h r5, r3, r2
+// tmp5 = blkptr[2] - blkptr[5];
+// tmp4 = blkptr[3] - blkptr[4];
+ psubx.h r6, r3, r2
+// tmp0 = blkptr[0] + blkptr[7];
+// tmp1 = blkptr[1] + blkptr[6];
+ paddx.h r2, r4, r1
+// tmp7 = blkptr[0] - blkptr[7];
+// tmp6 = blkptr[1] - blkptr[6];
+ psubx.h r3, r4, r1
+
+// /* Even part per LL&M figure 1 --- note that published figure is faulty;
+// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+// */
+
+#define tmp10 r1
+#define tmp13 r5
+#define tmp11 r7
+#define tmp12 r3
+#define z1 r9
+
+// tmp10 = tmp0 + tmp3;
+// tmp13 = tmp0 - tmp3;
+ paddsub.h r1, r2:t, r5:b
+// tmp11 = tmp1 + tmp2;
+// tmp12 = tmp1 - tmp2;
+ paddsub.h r4, r2:b, r5:t
+
+
+// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
+// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
+ paddsub.h r7, r1:t, r4:t
+ ld.w r10, pc[const_table - .]
+ plsl.h r7, r7, PASS1_BITS
+
+// z1 = (tmp12 + tmp13) * FIX_0_541196100;
+ addhh.w r8, r4:b, r1:b
+ mulhh.w r8, r8:b, r10:t
+
+// dataptr[2] =
+// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
+// dataptr[6] =
+// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
+ mulhh.w r9, r1:b, r10:b
+ ld.w r10, pc[const_table - . + 4]
+ add r1, r8, r9
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+
+ mulhh.w r9, r4:b, r10:t
+ add r4, r8, r9
+ satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
+
+
+// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+// * cK represents cos(K*pi/16).
+// * i0..i3 in the paper are tmp4..tmp7 here.
+// */
+
+#define z2 r5
+#define z3 r6
+#define z4 r7
+#define z5 r8
+
+// z4 = tmp5 + tmp7;
+// z3 = tmp4 + tmp6;
+ padd.h r2, r6, r3
+// z2 = tmp5 + tmp6;
+// z1 = tmp4 + tmp7;
+ paddx.h r5, r6, r3
+
+ lddpc r9, pc[const_table - . + 8]
+// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+ addhh.w r8, r2:t, r2:b
+ mulhh.w r8, r8:b, r10:b
+ lddpc r10, pc[const_table - . + 12]
+
+
+// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+ mulhh.w r11, r6:b, r9:t
+
+// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+ mulhh.w r6, r6:t, r9:b
+
+// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+ lddpc r9, pc[const_table - . + 20]
+ mulhh.w lr, r3:b, r10:t
+
+// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+ mulhh.w r3, r3:t, r10:b
+
+// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
+ mulhh.w r10, r2:b, r9:t
+
+// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
+ mulhh.w r2, r2:t, r9:b
+ lddpc r9, pc[const_table - . + 16]
+// z3 += z5;
+// z4 += z5;
+ add r10, r8
+ add r2, r8
+
+// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
+ mulhh.w r8, r5:b, r9:t
+
+// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
+ mulhh.w r5, r5:t, r9:b
+
+// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+ add r11, r8
+ add r11, r10
+ satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
+
+// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+ add r6, r5
+
+ sthh.w blkptr[6*2], r4:b, r11:b
+ add r6, r2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+
+// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+ add lr, r5
+ sthh.w blkptr[4*2], r7:b, r6:b
+ add lr, r10
+ satrnds lr >> (CONST_BITS - PASS1_BITS), 31
+
+// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
+ add r3, r8
+ sthh.w blkptr[2*2], r1:b, lr:b
+ add r3, r2
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+
+
+// dataptr += 8; /* advance pointer to next row */
+// blkptr += 8;
+ sthh.w blkptr[0], r7:t, r3:b
+ sub blkptr, -16
+ sub loop_ctr, 1
+ brne ROW_LOOP
+
+// }
+
+ /* Pass 2: process columns.
+ * We remove the PASS1_BITS scaling, but leave the results scaled up
+ * by an overall factor of 8.
+ */
+
+// dataptr = data;
+ sub blkptr, 128
+
+ mov loop_ctr, 4
+// for (i = 0; i < 8; i++) {
+COLUMN_LOOP:
+ ld.w r1, blkptr[0]
+ ld.w r2, blkptr[1*8*2]
+ ld.w r3, blkptr[2*8*2]
+ ld.w r4, blkptr[3*8*2]
+ ld.w r5, blkptr[4*8*2]
+ ld.w r6, blkptr[5*8*2]
+ ld.w r7, blkptr[6*8*2]
+ ld.w r8, blkptr[7*8*2]
+
+// tmp0 = blkptr[0] + blkptr[7*8];
+ padds.sh r9, r1, r8
+// tmp7 = blkptr[0] - blkptr[7*8];
+ psubs.sh r1, r1, r8
+// tmp1 = blkptr[1*8] + blkptr[6*8];
+ padds.sh r8, r2, r7
+// tmp6 = blkptr[1*8] - blkptr[6*8];
+ psubs.sh r2, r2, r7
+// tmp2 = blkptr[2*8] + blkptr[5*8];
+ padds.sh r7, r3, r6
+// tmp5 = blkptr[2*8] - blkptr[5*8];
+ psubs.sh r3, r3, r6
+// tmp3 = blkptr[3*8] + blkptr[4*8];
+ padds.sh r6, r4, r5
+// tmp4 = blkptr[3*8] - blkptr[4*8];
+ psubs.sh r4, r4, r5
+
+// /* even part per ll&m figure 1 --- note that published figure is faulty;
+// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+// */
+//
+// tmp10 = tmp0 + tmp3;
+ padds.sh r5, r9, r6
+// tmp13 = tmp0 - tmp3;
+ psubs.sh r9, r9, r6
+// tmp11 = tmp1 + tmp2;
+ padds.sh r6, r8, r7
+// tmp12 = tmp1 - tmp2;
+ psubs.sh r8, r8, r7
+
+// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
+// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
+//padds.sh saturates, so a potential overflow here is clamped rather than wrapped
+ padds.sh r7, r5, r6
+ psubs.sh r5, r5, r6
+
+ //Rounding
+ mov lr, (1 << (PASS1_BITS + 2))
+ orh lr, hi(1 << (16 + PASS1_BITS + 2))
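+ /* lr now holds 1 << (PASS1_BITS + 2) in both halfword lanes: the bias
+ that makes the arithmetic shifts below round to nearest. */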
+ padds.sh r7, r7, lr
+ padds.sh r5, r5, lr
+
+ pasr.h r7, r7, PASS1_BITS + 3
+ pasr.h r5, r5, PASS1_BITS + 3
+ st.w r12[0], r7
+ st.w r12[4*8*2], r5
+
+ lddpc r10, const_table2
+
+
+// z1 = (tmp12 + tmp13) * FIX_0_541196100;
+ padds.sh r5, r8, r9
+ mulhh.w r6, r5:t, r10:t
+ mulhh.w r7, r5:b, r10:t
+
+// dataptr[16] =
+// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
+ lddpc r11, const_table2 + 4
+ mulhh.w lr, r9:t, r10:b
+ mulhh.w r9, r9:b, r10:b
+ add lr, r6
+ add r9, r7
+ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
+ satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
+ sthh.w r12[2*8*2], lr:b, r9:b
+
+// dataptr[48] =
+// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
+ mulhh.w lr, r8:t, r11:t
+ mulhh.w r8, r8:b, r11:t
+ add lr, r6
+ add r8, r7
+ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
+ satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
+ sthh.w r12[6*8*2], lr:b, r8:b
+
+// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+// * cK represents cos(K*pi/16).
+// * i0..i3 in the paper are tmp4..tmp7 here.
+// */
+//
+// z2 = tmp5 + tmp6;
+// z3 = tmp4 + tmp6;
+// z4 = tmp5 + tmp7;
+ padds.sh r5, r3, r2
+ padds.sh r6, r4, r2
+ padds.sh r7, r3, r1
+
+// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+ padds.sh r8, r6, r7
+ mulhh.w r9, r8:t, r11:b
+ mulhh.w r8, r8:b, r11:b
+
+// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
+// z3 += z5;
+ lddpc r11, const_table2 + 8
+ mulhh.w r10, r6:t, r11:t
+ mulhh.w r6, r6:b, r11:t
+ add r10, r9
+ add r6, r8
+
+// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
+// z4 += z5;
+ mulhh.w lr, r7:t, r11:b
+ mulhh.w r7, r7:b, r11:b
+ lddpc r11, const_table2 + 12
+ st.w --sp,r0
+ add lr, r9
+ add r7, r8
+
+// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+ mulhh.w r0, r2:t, r11:t
+ machh.w r0, r5:t, r11:b
+ mulhh.w r2, r2:b, r11:t
+ machh.w r2, r5:b, r11:b
+
+// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
+// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
+ add r0, r10
+ lddpc r11, const_table2 + 16
+ add r2, r6
+ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
+ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
+ sthh.w r12[3*8*2], r0:b, r2:b
+// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+ mulhh.w r0, r3:t, r11:t
+ machh.w r0, r5:t, r11:b
+ mulhh.w r2, r3:b, r11:t
+ machh.w r2, r5:b, r11:b
+ add r0, lr
+ lddpc r11, const_table2 + 20
+ add r2, r7
+
+// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
+ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
+ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
+ sthh.w r12[5*8*2], r0:b, r2:b
+
+
+// z1 = tmp4 + tmp7;
+ padds.sh r2, r4, r1
+
+// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+ mulhh.w r3, r4:t, r11:t
+ machh.w r3, r2:t, r11:b
+ mulhh.w r4, r4:b, r11:t
+ machh.w r4, r2:b, r11:b
+ add r3, r10
+ lddpc r11, const_table2 + 24
+ add r4, r6
+
+// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
+// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
+ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
+ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
+ sthh.w r12[7*8*2], r3:b, r4:b
+
+
+// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+ mulhh.w r3, r1:t, r11:t
+ machh.w r3, r2:t, r11:b
+ mulhh.w r4, r1:b, r11:t
+ machh.w r4, r2:b, r11:b
+ add r3, lr
+ add r4, r7
+
+// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
+ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
+ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
+ sthh.w r12[1*8*2], r3:b, r4:b
+ ld.w r0, sp++
+
+// dataptr++; /* advance pointer to next column */
+ sub blkptr, -4
+ sub loop_ctr, 1
+ brne COLUMN_LOOP
+
+// }
+
+ popm r0-r3, r4-r7, pc
+
+// /* descale */
+// for (i = 0; i < 64; i++)
+// block[i] = (short int) DESCALE(data[i], 3);
+
+
+//}
+
+
+ .align 2
+const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
+ .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
+ .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
+
+const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
+ .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
+ .short FIX_1_501321110, -FIX_0_899976223
+
diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S
new file mode 100644
index 0000000..97009ee
--- /dev/null
+++ b/libavcodec/avr32/h264idct.S
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+ .global h264_idct_add_avr32
+
+ /* Macro for performing the 1-D transform on one row.
+
+ The register 'w01' should contain the first two coefficients,
+ and the register 'w23' should contain the last two coefficients
+ of the line. The result is placed back in w01 and w23
+ so that { w01, w23 } = { x0, x1, x3, x2 }.
+ 'tmp' and 'tmp2' should be scratch registers. */
+ .macro transform_row w01, w23, tmp, tmp2
+ add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
+ sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
+ bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
+ pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
+ paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
+ padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
+ psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
+ .endm
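+
+ /* Scalar reference for the 1-D transform above (the same form as
+ ff_h264_idct_add_c):
+ z0 = w0 + w2; z1 = w0 - w2;
+ z2 = (w1 >> 1) - w3; z3 = w1 + (w3 >> 1);
+ x0 = z0 + z3; x1 = z1 + z2;
+ x2 = z1 - z2; x3 = z0 - z3; */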
+
+ /* Macro for performing the 1-D transform on two columns.
+
+ The registers w0, w1, w2, w3 should each contain two
+ packed samples from the two columns to transform.
+ tmp and tmp2 are scratchpad registers.
+
+ The resulting transformed columns are placed in the
+ same positions as the input columns.
+ */
+ .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
+ padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
+ psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
+ pasr.h \w2, \w1, 1 /* w2 = w1/2 */
+ pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
+ psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
+ padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
+ padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
+ psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
+ padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
+ psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
+ /* Scale down result. */
+ pasr.h \w0, \w0, 6
+ pasr.h \w1, \w1, 6
+ pasr.h \w2, \w2, 6
+ pasr.h \w3, \w3, 6
+ .endm
+
+/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
+
+h264_idct_add_avr32:
+
+ stm --sp,r0-r3,r4-r7, lr
+
+ /* Setup rounding factor. */
+ mov r0, (1 << 5)
+ lsl r0, 16
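+ /* r0 = 32 in the top halfword lane only; adding it to r9 below
+ implements block[0] += 32, the bias for the final >> 6. */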
+
+ /* Load block */
+ ldm r11,r2-r9
+ /* r9 = { w00, w01 },
+ r8 = { w02, w03 },
+ r7 = { w10, w11 },
+ r6 = { w12, w13 },
+ r5 = { w20, w21 },
+ r4 = { w22, w23 },
+ r3 = { w30, w31 },
+ r2 = { w32, w33 } */
+
+ /* Add the rounding factor to w00. */
+ add r9, r0
+
+ /* Transform rows */
+ transform_row r9, r8, r0, r1
+ transform_row r7, r6, r0, r1
+ transform_row r5, r4, r0, r1
+ transform_row r3, r2, r0, r1
+
+ /* Transform columns */
+ transform_2columns r9, r7, r5, r3, r0, r1
+ transform_2columns r8, r6, r4, r2, r0, r1
+
+ /* Load predicted pixels.*/
+ ld.w lr, r12[0]
+ ld.w r11, r12[r10]
+
+ /* Unpack to halfwords. */
+ punpckub.h r0, lr:t
+ punpckub.h r1, lr:b
+
+ /* Add with transformed row. */
+ padd.h r0, r0, r9
+ paddx.h r1, r1, r8
+ /* Pack and saturate back to 8-bit pixels. */
+ packsh.ub r0, r0, r1
+
+ /* Unpack to halfwords. */
+ punpckub.h lr, r11:t
+ punpckub.h r11, r11:b
+
+ /* Add with transformed row. */
+ padd.h lr, lr, r7
+ paddx.h r11, r11, r6
+ /* Pack and saturate back to 8-bit pixels. */
+ packsh.ub r1, lr, r11
+
+ /* Store back to frame. */
+ st.w r12[0], r0
+ st.w r12[r10], r1
+
+ add r12, r12, r10 << 1
+
+ /* Load predicted pixels.*/
+ ld.w lr, r12[0]
+ ld.w r11, r12[r10]
+
+ /* Unpack to halfwords. */
+ punpckub.h r0, lr:t
+ punpckub.h r1, lr:b
+
+ /* Add with transformed row. */
+ padd.h r0, r0, r5
+ paddx.h r1, r1, r4
+ /* Pack and saturate back to 8-bit pixels. */
+ packsh.ub r0, r0, r1
+
+ /* Unpack to halfwords. */
+ punpckub.h lr, r11:t
+ punpckub.h r11, r11:b
+
+ /* Add with transformed row. */
+ padd.h lr, lr, r3
+ paddx.h r11, r11, r2
+ /* Pack and saturate back to 8-bit pixels. */
+ packsh.ub r1, lr, r11
+
+ /* Store back to frame. */
+ st.w r12[0], r0
+ st.w r12[r10], r1
+
+ ldm sp++,r0-r3,r4-r7, pc
+
+
+ .global h264_idct8_add_avr32
+//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
+
+h264_idct8_add_avr32:
+ stm --sp,r0-r3,r4-r7, lr
+
+ /* Push dst and stride on stack */
+ stm --sp,r10,r12
+
+// int i;
+// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
+// uint8_t *cm = cropTbl + MAX_NEG_CROP;
+
+// block[0] += 32;
+
+
+// for( i = 0; i < 8; i++ )
+// {
+ mov lr, 4
+0:
+ ld.w r7, r11[0*(8*2)]
+ ld.w r6, r11[1*(8*2)]
+ ld.w r5, r11[2*(8*2)]
+ ld.w r4, r11[3*(8*2)]
+ ld.w r3, r11[4*(8*2)]
+ ld.w r2, r11[5*(8*2)]
+ ld.w r1, r11[6*(8*2)]
+ ld.w r0, r11[7*(8*2)]
+
+/*
+ const int a0 = src[0][i] + src[4][i];
+ const int a2 = src[0][i] - src[4][i];
+ const int a4 = (src[2][i]>>1) - src[6][i];
+ const int a6 = (src[6][i]>>1) + src[2][i];
+*/
+ padd.h r8, r7, r3 /* r8 = a0 */
+ psub.h r7, r7, r3 /* r7 = a2 */
+ pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
+ pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
+ psub.h r3, r3, r1 /* r3 = a4 */
+ padd.h r9, r9, r5 /* r9 = a6 */
+
+/*
+ const int b0 = a0 + a6;
+ const int b2 = a2 + a4;
+ const int b4 = a2 - a4;
+ const int b6 = a0 - a6;
+*/
+ padd.h r1, r8, r9 /* r1 = b0 */
+ psub.h r8, r8, r9 /* r8 = b6 */
+ padd.h r5, r7, r3 /* r5 = b2 */
+ psub.h r7, r7, r3 /* r7 = b4 */
+
+/*
+ const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
+ const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
+ const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
+ const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
+*/
+ pasr.h r3, r0, 1
+ padd.h r3, r3, r0
+ psub.h r3, r2, r3
+ psub.h r3, r3, r4 /* r3 = a1 */
+
+ pasr.h r9, r4, 1
+ padd.h r9, r9, r4
+ psub.h r9, r0, r9
+ padd.h r9, r6, r9 /* r9 = a3 */
+
+ pasr.h r10, r2, 1
+ padd.h r10, r10, r2
+ padd.h r10, r10, r0
+ psub.h r10, r10, r6 /* r10 = a5 */
+
+ pasr.h r0, r6, 1
+ padd.h r0, r0, r6
+ padd.h r0, r0, r2
+ padd.h r0, r0, r4 /* r0 = a7 */
+
+/*
+ const int b1 = (a7>>2) + a1;
+ const int b3 = a3 + (a5>>2);
+ const int b5 = (a3>>2) - a5;
+ const int b7 = a7 - (a1>>2);
+*/
+ pasr.h r2, r0, 2
+ padd.h r2, r2, r3 /* r2 = b1 */
+ pasr.h r3, r3, 2
+ psub.h r3, r0, r3 /* r3 = b7 */
+
+ pasr.h r0, r10, 2
+ padd.h r0, r0, r9 /* r0 = b3 */
+ pasr.h r9, r9, 2
+ psub.h r9, r9, r10 /* r9 = b5 */
+
+/*
+ src[0][i] = b0 + b7;
+ src[7][i] = b0 - b7;
+ src[1][i] = b2 + b5;
+ src[6][i] = b2 - b5;
+ src[2][i] = b4 + b3;
+ src[5][i] = b4 - b3;
+ src[3][i] = b6 + b1;
+ src[4][i] = b6 - b1;
+*/
+
+ padd.h r4, r1, r3
+ psub.h r1, r1, r3
+ st.w r11[0*(8*2)], r4
+ st.w r11[7*(8*2)], r1
+
+ padd.h r3, r5, r9
+ psub.h r5, r5, r9
+ st.w r11[1*(8*2)], r3
+ st.w r11[6*(8*2)], r5
+
+ padd.h r9, r7, r0
+ psub.h r7, r7, r0
+ st.w r11[2*(8*2)], r9
+ st.w r11[5*(8*2)], r7
+
+ padd.h r0, r8, r2
+ psub.h r8, r8, r2
+ st.w r11[3*(8*2)], r0
+ st.w r11[4*(8*2)], r8
+
+ sub r11, -4
+ sub lr, 1
+ brne 0b
+
+// }
+
+ lddsp r12, sp[0] /* r12 = dst */
+ sub r11, 4*4
+ ldm r11++, r4-r7
+ mov lr, 8
+
+1:
+// for( i = 0; i < 8; i++ )
+// {
+
+ /*
+ r7 = {src[i][0], src[i][1]}
+ r6 = {src[i][2], src[i][3]}
+ r5 = {src[i][4], src[i][5]}
+ r4 = {src[i][6], src[i][7]}
+ */
+
+/*
+ const int a0 = src[i][0] + src[i][4];
+ const int a2 = src[i][0] - src[i][4];
+ const int a4 = (src[i][2]>>1) - src[i][6];
+ const int a6 = (src[i][6]>>1) + src[i][2];
+*/
+ pasr.h r8, r6, 1
+ pasr.h r9, r4, 1
+ addhh.w r0, r7:t, r5:t /* r0 = a0 */
+ subhh.w r1, r7:t, r5:t /* r1 = a2 */
+ subhh.w r2, r8:t, r4:t /* r2 = a4 */
+ addhh.w r3, r9:t, r6:t /* r3 = a6 */
+
+/*
+ const int b0 = a0 + a6;
+ const int b2 = a2 + a4;
+ const int b4 = a2 - a4;
+ const int b6 = a0 - a6;
+*/
+ add r10, r0, r3 /* r10 = b0 */
+ sub r0, r3 /* r0 = b6 */
+ add r3, r1, r2 /* r3 = b2 */
+ sub r1, r2 /* r1 = b4 */
+
+/*
+ const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
+ const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
+ const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
+ const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1);
+*/
+ addhh.w r8, r8:b, r6:b
+ addhh.w r2, r4:b, r7:b
+ sub r2, r8 /* r2 = a3 */
+
+ addhh.w r9, r9:b, r4:b
+ subhh.w r8, r5:b, r6:b
+ sub r8, r9 /* r8 = a1 */
+
+ pasr.h r9, r7, 1
+ addhh.w r9, r9:b, r7:b
+ addhh.w r6, r5:b, r6:b
+ add r6, r9 /* r6 = a7 */
+
+ pasr.h r9, r5, 1
+ addhh.w r9, r9:b, r5:b
+ subhh.w r5, r4:b, r7:b
+ add r5, r9 /* r5 = a5 */
+
+/* const int b1 = (a7>>2) + a1;
+ const int b3 = (a5>>2) + a3;
+ const int b5 = (a3>>2) - a5;
+ const int b7 = -(a1>>2) + a7;
+*/
+ asr r4, r6, 2
+ add r4, r8 /* r4 = b1 */
+ asr r8, 2
+ rsub r8, r6 /* r8 = b7 */
+
+ asr r6, r5, 2
+ add r6, r2 /* r6 = b3 */
+ asr r2, 2
+ sub r2, r5 /* r2 = b5 */
+
+/*
+ dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
+ dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
+ dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
+ dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
+ dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
+ dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
+ dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
+ dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
+*/
+ add r5, r10, r8
+ satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
+ sub r10, r8
+ satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
+ add r8, r3, r2
+ satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
+ sub r3, r2
+ satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
+
+ add r2, r1, r6
+ satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
+ sub r1, r6
+ satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
+
+ add r6, r0, r4
+ satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
+ sub r0, r4
+ satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
+
+ ld.w r4, r12[0]
+
+ packw.sh r8, r5, r8
+ packw.sh r7, r2, r6
+ ld.w r9, r12[4]
+ packw.sh r6, r0, r1
+ packw.sh r5, r3, r10
+
+ punpckub.h r10, r4:t
+ punpckub.h r4, r4:b
+ punpckub.h r3, r9:t
+ punpckub.h r9, r9:b
+
+ padd.h r8, r8, r10
+ padd.h r7, r7, r4
+ padd.h r6, r6, r3
+ padd.h r5, r5, r9
+
+ lddsp r10, sp[4] /* r10 = stride */
+ packsh.ub r0, r8, r7
+ packsh.ub r1, r6, r5
+
+ st.w r12[0], r0
+ st.w r12[4], r1
+
+ ldm r11++, r4-r7
+ add r12, r10 /* dst += stride */
+
+ sub lr, 1
+ brne 1b
+
+ sub sp, -8
+ ldm sp++,r0-r3,r4-r7, pc
+// }
+//}
+
diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S
new file mode 100644
index 0000000..19251cb
--- /dev/null
+++ b/libavcodec/avr32/idct.S
@@ -0,0 +1,793 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+ .global idct_add_avr32
+ .global idct_put_avr32
+ .global idct_avr32
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define ONE ((INT32) 1)
+
+#define CONST_SCALE (ONE << CONST_BITS)
+
+#define LINE_SIZE 32
+
+#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 (12299)/* FIX(1.501321110) */
+#define FIX_1_847759065 (15137)/* FIX(1.847759065) */
+#define FIX_1_961570560 (16069)/* FIX(1.961570560) */
+#define FIX_2_053119869 (16819)/* FIX(2.053119869) */
+#define FIX_2_562915447 (20995)/* FIX(2.562915447) */
+#define FIX_3_072711026 (25172)/* FIX(3.072711026) */
+
+
+#define loop_cnt r11
+
+ .text
+
+idct_add_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for some variables on the stack
+ sub sp, 8
+ stdsp SP[0], r12 // rfp
+ stdsp SP[4], r11 // iinc
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+FOR_ROW:
+
+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r10[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne AC_ROW //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d r10++, r4
+ st.d r10++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
+
+ bral COLOUMN_TRANSFORM //Perform the column transform after the row transform is computed
+
+
+AC_ROW:
+
+
+ ld.w r12, pc[coef_table - .]
+ ld.w r9, pc[coef_table - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r12:t
+ mulhh.w r6, r0:t, r12:b
+ ld.w r12, pc[coef_table - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table - . + 12]
+ mulhh.w r4, r4:b, r12:t // r4 = z3
+ mulhh.w lr, lr:b, r12:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r12, pc[coef_table - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table - . + 20]
+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w r10[0], r4:t, r5:t
+ sthh.w r10[4], r3:t, r2:t
+ sthh.w r10[8], r2:b, r3:b
+ sthh.w r10[12], r5:b, r4:b
+
+ sub r10, -16
+ sub loop_cnt, 1
+ brne FOR_ROW, e
+
+COLOUMN_TRANSFORM:
+ sub r10, 128 //Set pointer to start of DCT block
+ mov loop_cnt, 8
+
+FOR_COLOUMN:
+ ldins.h r3:t,r10[0] // r3:t = dataptr[0]
+ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
+ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
+ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
+ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
+ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
+ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
+ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne AC_COLOUMN //If there are non-zero AC coefficients, perform the column transform
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9
+ ld.d r0, r12[0]
+ sub r10, -2 // Increment the dataptr
+ bfins r3, r3, 16, 16
+ punpckub.h r2, r1:t
+ padd.h r2, r2, r3
+ punpckub.h r1, r1:b
+ padd.h r1, r1, r3
+ packsh.ub r1, r2, r1
+ punpckub.h r2, r0:t
+ padd.h r2, r2, r3
+ punpckub.h r0, r0:b
+ padd.h r0, r0, r3
+ packsh.ub r0, r2, r0
+ st.d r12[0], r0
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+AC_COLOUMN:
+ ld.w r12, pc[coef_table - .]
+ ld.w r9, pc[coef_table - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r12:t // r4 = z1
+ mulhh.w r5, r2:b, r12:b
+ ld.w r12, pc[coef_table - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table - . + 12]
+ mulhh.w r8, r6:b, r12:t // r8 = z3
+ mulhh.w r6, r6:t, r12:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table - . + 16]
+ add r12, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+ add r12, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r1, r1, r6
+ packw.sh r8, r8, r0
+ packw.sh r3, r3, r5
+ packw.sh r4, r4, r2
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ ld.d r6, r12[0]
+ sub r10, -2 // Increment the dataptr
+ punpckub.h r0, r7:t
+ padd.h r1, r1, r0
+ punpckub.h r0, r7:b
+ padd.h r8, r8, r0
+ packsh.ub r7, r1, r8
+ punpckub.h r0, r6:t
+ padd.h r3, r3, r0
+ punpckub.h r0, r6:b
+ padd.h r4, r4, r0
+ packsh.ub r6, r3, r4
+ st.d r12[0], r6
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+//Coefficient Table:
+ .align 2
+coef_table:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
+
+
+idct_put_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for some variables on the stack
+ sub sp, 8
+ stdsp SP[0], r12 // rfp
+ stdsp SP[4], r11 // iinc
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+0:
+
+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r10[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d r10++, r4
+ st.d r10++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ bral 2f //Perform the column transform after the row transform is computed
+
+1:
+ ld.w r12, pc[coef_table_copy - .]
+ ld.w r9, pc[coef_table_copy - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r12:t
+ mulhh.w r6, r0:t, r12:b
+ ld.w r12, pc[coef_table_copy - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table_copy - . + 12]
+ mulhh.w r4, r4:b, r12:t // r4 = z3
+ mulhh.w lr, lr:b, r12:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r12, pc[coef_table_copy - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table_copy - . + 20]
+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w r10[0], r4:t, r5:t
+ sthh.w r10[4], r3:t, r2:t
+ sthh.w r10[8], r2:b, r3:b
+ sthh.w r10[12], r5:b, r4:b
+
+ sub r10, -16
+ sub loop_cnt, 1
+ brne 0b
+
+2:
+
+ sub r10, 128 //Set pointer to start of DCT block
+
+ mov loop_cnt, 8
+
+0:
+ ldins.h r3:t,r10[0] // r3:t = dataptr[0]
+ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
+ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
+ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
+ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
+ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
+ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
+ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the column transform
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
+ packw.sh r3, r3, r3
+ packsh.ub r3, r3, r3
+ mov r2, r3
+ st.d r12[0], r2
+ add r12, r9 // increment rfp
+ sub r10, -2 // Increment the dataptr
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+1:
+ ld.w r12, pc[coef_table_copy - .]
+ ld.w r9, pc[coef_table_copy - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r12:t // r4 = z1
+ mulhh.w r5, r2:b, r12:b
+ ld.w r12, pc[coef_table_copy - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table_copy - . + 12]
+ mulhh.w r8, r6:b, r12:t // r8 = z3
+ mulhh.w r6, r6:t, r12:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table_copy - . + 16]
+ add r12, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table_copy - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+ add r12, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r1, r1, r6
+ packw.sh r8, r8, r0
+ packw.sh r3, r3, r5
+ packw.sh r4, r4, r2
+
+ packsh.ub r1, r1, r8
+ packsh.ub r0, r3, r4
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ st.d r12[0], r0
+ sub r10, -2 // Increment the dataptr
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+ .align 2
+coef_table_copy:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
+
+idct_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for a temporary block on the stack
+ sub sp, 8*8*2
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+0:
+ ldm r12++, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r12[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d sp++, r4
+ st.d sp++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ bral 2f //Perform the column transform after the row transform is computed
+
+1:
+ ld.w r10, pc[coef_table_idct - .]
+ ld.w r9, pc[coef_table_idct - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r10:t
+ mulhh.w r6, r0:t, r10:b
+ ld.w r10, pc[coef_table_idct - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table_idct - . + 12]
+ mulhh.w r4, r4:b, r10:t // r4 = z3
+ mulhh.w lr, lr:b, r10:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r10, pc[coef_table_idct - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table_idct - . + 20]
+ mulhh.w r2, r2:b, r10:t // r2 = tmp2
+ mulhh.w r3, r3:b, r10:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w sp[0], r4:t, r5:t
+ sthh.w sp[4], r3:t, r2:t
+ sthh.w sp[8], r2:b, r3:b
+ sthh.w sp[12], r5:b, r4:b
+
+ sub sp, -16
+ sub loop_cnt, 1
+ brne 0b
+
+2:
+ sub sp, 8*8*2 //Set pointer to start of DCT block
+ sub r12, 8*8*2 //Set pointer to start of DCT block
+
+ mov loop_cnt, 8
+
+0:
+ ldins.h r3:t,sp[0] // r3:t = dataptr[0]
+ ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1]
+ ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2]
+ ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5]
+ ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4]
+ ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3]
+ ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6]
+ ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the column transform
+
+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
+ packw.sh r3, r3, r3
+ mov r2, r3
+ st.d r12++, r2
+ st.d r12++, r2
+ sub sp, -2 // Increment the dataptr
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -(8*8*2 - 8)
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+1:
+ ld.w r10, pc[coef_table_idct - .]
+ ld.w r9, pc[coef_table_idct - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r10:t // r4 = z1
+ mulhh.w r5, r2:b, r10:b
+ ld.w r10, pc[coef_table_idct - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table_idct - . + 12]
+ mulhh.w r8, r6:b, r10:t // r8 = z3
+ mulhh.w r6, r6:t, r10:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r10, r0:b, r9:t // r10 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table_idct - . + 16]
+ add r10, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table_idct - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+ add r10, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r7, r1, r6
+ packw.sh r6, r8, r0
+ packw.sh r5, r3, r5
+ packw.sh r4, r4, r2
+
+ stm r12, r4-r7
+ sub sp, -2 // Increment the dataptr
+ sub r12, -16
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -(8*8*2 - 8)
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+ .align 2
+coef_table_idct:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
+
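A note on the fixed-point conventions used throughout this file: the FIX_* values are the familiar libjpeg-style CONST_BITS=13 factors, and both passes short-circuit rows/columns whose AC terms are all zero. The following sketch is illustrative only, not code from the patch:

    #include <stdint.h>

    #define CONST_BITS 13
    #define PASS1_BITS 2

    /* FIX(0.541196100) = (int)(0.541196100 * 8192 + 0.5) = 4433, matching
     * FIX_0_541196100 above; the other constants follow the same rule. */
    #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))

    /* Row-pass shortcut taken when the 'or r4, ...' test finds no AC
     * terms: the whole row collapses to the DC value << PASS1_BITS. */
    static void idct_row_dc_only(int16_t row[8])
    {
        int i, dc = row[0] << PASS1_BITS;
        for (i = 0; i < 8; i++)
            row[i] = dc;
    }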
diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S
new file mode 100644
index 0000000..2ffcf7f
--- /dev/null
+++ b/libavcodec/avr32/mc.S
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+
+ /* Macro for masking the lowest bit of each byte in a
+ packed word */
+ .macro packedmask1 reg, round
+ .if \round
+ and \reg, \reg, r8 >> 1
+ .else
+ and \reg, r8
+ .endif
+ .endm
+
+ /* Macro for 8 pixel wide horizontal and vertical interpolation functions */
+ .macro pixels8_hv round, put
+
+
+ pushm r0-r7, lr
+
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+
+ /* Rounding immediate */
+ .if \round
+ mov r8, lo(0x02020202)
+ orh r8, hi(0x02020202)
+ .else
+ mov r8, lo(0x01010101)
+ orh r8, hi(0x01010101)
+ .endif
+ mov r7, 2
+
+ /* Pixel naming convention :
+
+ |-----------------------------------------------------|
+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
+ |----d00---d01---d02---d03---d04---d05---d06---d07----|
+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
+ |-----------------------------------------------------|
+ */
+1:
+ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
+ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
+ mov lr, r9
+ eor r2, r0, r1
+ packedmask1 r2, \round
+ add r2, r8
+
+ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+
+ add r11, r10 // pixels += line_size
+ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
+0:
+ eor r5, r1, r3
+ packedmask1 r5, \round
+ add r2, r5
+
+ paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
+ eor r6, r0, r1
+ packedmask1 r6, \round
+ add r2, r2, r6 << 1
+
+ ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 }
+ add r11, r10 // pixels += line_size
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+
+ paddh.ub r0, r0, r1
+ plsr.b r2, r2, 2
+ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
+
+ /* Next row */
+ .if \put
+ eor r2, r3, r4
+ packedmask1 r2, \round
+ add r2, r8
+ .else
+ ld.w r6, r12[0]
+ eor r2, r3, r4
+ packedmask1 r2, \round
+ add r2, r8
+ pavg.ub r0, r0, r6
+ .endif
+ st.w r12[0], r0 // Put data into the block
+
+ add r5, r2
+ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+
+ eor r6, r0, r1
+ packedmask1 r6, \round
+ add r5, r5, r6 << 1
+
+ .if \put
+ paddh.ub r1, r0, r1
+ plsr.b r5, r5, 2
+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
+ .else
+ ld.w r3, r12[r10]
+ paddh.ub r1, r0, r1
+ plsr.b r5, r5, 2
+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
+ pavg.ub r1, r1, r3
+ .endif
+
+ st.w r12[r10], r1 // Put data into the block
+
+ ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 }
+ add r11, r10 // pixels += line_size
+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
+ add r12, r12, r10 << 1 // block += 2*line_size
+ sub lr, 2
+ brne 0b
+
+ mul r0, r10, r9 // r0 = line_size * h
+ rsub r0, r0, 4 // r0 = 4 - (line_size * h)
+ add r11, r0
+ sub r11, r10 // pixels += 4 - (line_size * (h+1))
+ add r12, r0 // block += 4 - (line_size * h)
+ sub r7, 1
+ brne 1b
+
+ popm r0-r7, pc
+ .endm
+
+
+ /* Macro for 8 pixel wide vertical interpolation functions */
+
+ .macro pixels8_v round, put
+ pushm r4-r7,lr
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+
+ /*
+ Pixel Naming Convention :
+ |-----------------------------------------------|
+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
+ |-d00---d01---d02---d03---d04---d05---d06---d07-|
+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
+ |-----------------------------------------------|
+ */
+ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
+ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
+ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
+ sub r10, 4 // stride -= 4
+ add r11, r11, r10 << 1 // src += 2*stride
+ sub r11, -4 // src += 4
+
+0:
+ .if \round
+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .else
+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .endif
+
+ .if \put
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
+ .else
+ ld.w lr, r12[0]
+ ld.w r7, r12[4]
+ pavg.ub r5, r5, lr
+ pavg.ub r4, r4, r7
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
+ .endif
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+
+ .if \round
+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .else
+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .endif
+ .if \put
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
+ .else
+ ld.w r8, r12[0]
+ ld.w r6, r12[4]
+ pavg.ub r5, r5, r8
+ pavg.ub r4, r4, r6
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
+ .endif
+
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+ sub r9, 2
+ brne 0b
+
+ popm r4-r7,pc
+ .endm
+
+ /* Macro for 8 pixel wide horizontal interpolation functions */
+
+ .macro pixels8_h round, put
+ pushm r4-r7, lr
+
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+ /*
+ Pixel Naming Convention:
+ |--------------------------------------------------------------------|
+ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
+ |------|-------|-------|-------|-------|-------|-------|-------|-----|
+ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
+ |--------------------------------------------------------------------|
+ */
+
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+
+0:
+ .if \round
+ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .else
+ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .endif
+ .if \put
+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+ .else
+ ld.w r8, r12[0]
+ ld.w r6, r12[4]
+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+ pavg.ub lr, lr, r8
+ pavg.ub r7, r7, r6
+ .endif
+ st.w r12[0], lr // dst = { d00, d01, d02, d03 }
+ st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
+ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+
+ .if \round
+ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .else
+ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .endif
+ .if \put
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ .else
+ ld.w r7, r12[0]
+ ld.w r6, r12[4]
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ pavg.ub r5, r5, r7
+ pavg.ub r4, r4, r6
+ .endif
+ st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
+ st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+ sub r9, 2
+ brne 0b
+
+ popm r4-r7, pc
+ .endm
+
+ /* Macro for 8 pixel wide copy functions */
+ .macro pixels8 put
+ stm --sp, r3-r7,lr
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+ mov lr, r9
+ sub r3, r10, 2 // stride2 = stride - 2
+0:
+ .if \put
+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
+ .else
+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
+ ld.d r4, r12[0]
+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
+ pavg.ub r6, r6, r4
+ pavg.ub r7, r7, r5
+ ld.d r4, r12[r10]
+ .endif
+ st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }
+ add r11, r11, r3 << 1 // src += stride2 * 2
+ .ifeq \put
+ pavg.ub r8, r8, r4
+ pavg.ub r9, r9, r5
+ .endif
+ st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 }
+ add r12, r12, r10 << 1 // dst += 2*stride
+ sub lr, 2
+ brne 0b
+ ldm sp++, r3-r7,pc
+
+ .endm
+
+ .global put_no_rnd_pixels8_hv_avr32
+ .text
+put_no_rnd_pixels8_hv_avr32:
+ pixels8_hv 0, 1
+
+ .global put_pixels8_hv_avr32
+ .text
+put_pixels8_hv_avr32:
+ pixels8_hv 1, 1
+
+ .global avg_no_rnd_pixels8_hv_avr32
+ .text
+avg_no_rnd_pixels8_hv_avr32:
+ pixels8_hv 0, 0
+
+ .global avg_pixels8_hv_avr32
+ .text
+avg_pixels8_hv_avr32:
+ pixels8_hv 1, 0
+
+ .global put_no_rnd_pixels8_v_avr32
+ .text
+put_no_rnd_pixels8_v_avr32:
+ pixels8_v 0, 1
+
+ .global put_pixels8_v_avr32
+ .text
+put_pixels8_v_avr32:
+ pixels8_v 1, 1
+
+ .global avg_no_rnd_pixels8_v_avr32
+ .text
+avg_no_rnd_pixels8_v_avr32:
+ pixels8_v 0, 0
+
+ .global avg_pixels8_v_avr32
+ .text
+avg_pixels8_v_avr32:
+ pixels8_v 1, 0
+
+ .global put_no_rnd_pixels8_h_avr32
+ .text
+put_no_rnd_pixels8_h_avr32:
+ pixels8_h 0, 1
+
+ .global put_pixels8_h_avr32
+ .text
+put_pixels8_h_avr32:
+ pixels8_h 1, 1
+
+ .global avg_no_rnd_pixels8_h_avr32
+ .text
+avg_no_rnd_pixels8_h_avr32:
+ pixels8_h 0, 0
+
+ .global avg_pixels8_h_avr32
+ .text
+avg_pixels8_h_avr32:
+ pixels8_h 1, 0
+
+ .global put_pixels8_avr32
+ .global put_no_rnd_pixels8_avr32
+ .text
+put_pixels8_avr32:
+put_no_rnd_pixels8_avr32:
+ pixels8 1
+
+ .global avg_no_rnd_pixels8_avr32
+ .global avg_pixels8_avr32
+ .text
+avg_pixels8_avr32:
+avg_no_rnd_pixels8_avr32:
+ pixels8 0
+
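On the macro parameters used in this file: round selects between the rounding per-byte average (pavg.ub) and the truncating one (paddh.ub), and put selects between a plain store and averaging with the destination. In scalar C terms, a sketch (helper names are mine, not the patch's):

    #include <stdint.h>

    /* Halfpel averaging rules selected by the 'round' parameter. */
    static inline uint8_t avg_rnd(uint8_t a, uint8_t b)    /* pavg.ub  */
    { return (a + b + 1) >> 1; }
    static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b) /* paddh.ub */
    { return (a + b) >> 1; }

    /* 2x2 halfpel kernel computed by pixels8_hv: the bias is 2 for the
     * rounding variants (the 0x02020202 constant) and 1 for the
     * no_rnd variants (0x01010101), applied before the final >> 2. */
    static inline uint8_t hv_kernel(uint8_t s00, uint8_t s01,
                                    uint8_t s10, uint8_t s11, int bias)
    { return (s00 + s01 + s10 + s11 + bias) >> 2; }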
diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h
new file mode 100644
index 0000000..f2020fd
--- /dev/null
+++ b/libavcodec/avr32/pico.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#ifndef __PICO_H__
+#define __PICO_H__
+
+/* Coprocessor Number */
+#define PICO_CPNO 1
+
+/* Pixel Coprocessor Register file */
+#define PICO_REGVECT_INPIX2 cr0
+#define PICO_REGVECT_INPIX1 cr1
+#define PICO_REGVECT_INPIX0 cr2
+#define PICO_REGVECT_OUTPIX2 cr3
+#define PICO_REGVECT_OUTPIX1 cr4
+#define PICO_REGVECT_OUTPIX0 cr5
+#define PICO_REGVECT_COEFF0_A cr6
+#define PICO_REGVECT_COEFF0_B cr7
+#define PICO_REGVECT_COEFF1_A cr8
+#define PICO_REGVECT_COEFF1_B cr9
+#define PICO_REGVECT_COEFF2_A cr10
+#define PICO_REGVECT_COEFF2_B cr11
+#define PICO_REGVECT_VMU0_OUT cr12
+#define PICO_REGVECT_VMU1_OUT cr13
+#define PICO_REGVECT_VMU2_OUT cr14
+#define PICO_REGVECT_CONFIG cr15
+
+#define PICO_INPIX2 0
+#define PICO_INPIX1 1
+#define PICO_INPIX0 2
+#define PICO_OUTPIX2 3
+#define PICO_OUTPIX1 4
+#define PICO_OUTPIX0 5
+#define PICO_COEFF0_A 6
+#define PICO_COEFF0_B 7
+#define PICO_COEFF1_A 8
+#define PICO_COEFF1_B 9
+#define PICO_COEFF2_A 10
+#define PICO_COEFF2_B 11
+#define PICO_VMU0_OUT 12
+#define PICO_VMU1_OUT 13
+#define PICO_VMU2_OUT 14
+#define PICO_CONFIG 15
+
+/* Config Register */
+#define PICO_COEFF_FRAC_BITS_OFFSET 0
+#define PICO_COEFF_FRAC_BITS_SIZE 4
+#define PICO_OFFSET_FRAC_BITS_OFFSET 4
+#define PICO_OFFSET_FRAC_BITS_SIZE 4
+#define PICO_INPUT_MODE_OFFSET 8
+#define PICO_INPUT_MODE_SIZE 2
+#define PICO_OUTPUT_MODE_OFFSET 10
+#define PICO_OUTPUT_MODE_SIZE 1
+
+struct pico_config_t {
+ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE;
+ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
+ unsigned int input_mode : PICO_INPUT_MODE_SIZE;
+ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
+ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
+ int vmu2_out;
+ int vmu1_out;
+ int vmu0_out;
+ short coeff2_2;
+ short coeff2_3;
+ short coeff2_0;
+ short coeff2_1;
+ short coeff1_2;
+ short coeff1_3;
+ short coeff1_0;
+ short coeff1_1;
+ short coeff0_2;
+ short coeff0_3;
+ short coeff0_0;
+ short coeff0_1;
+};
+
+
+#define PICO_COEFF_FRAC_BITS(x) (x << PICO_COEFF_FRAC_BITS_OFFSET)
+#define PICO_OFFSET_FRAC_BITS(x) (x << PICO_OFFSET_FRAC_BITS_OFFSET)
+#define PICO_INPUT_MODE(x) (x << PICO_INPUT_MODE_OFFSET)
+#define PICO_OUTPUT_MODE(x) (x << PICO_OUTPUT_MODE_OFFSET)
+
+#define GET_PICO_COEFF_FRAC_BITS(x) ((x >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
+#define GET_PICO_OFFSET_FRAC_BITS(x) ((x >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
+#define GET_PICO_INPUT_MODE(x) ((x >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
+#define GET_PICO_OUTPUT_MODE(x) ((x >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
+
+enum pico_input_mode { PICO_TRANSFORMATION_MODE,
+ PICO_HOR_FILTER_MODE,
+ PICO_VERT_FILTER_MODE };
+
+enum pico_output_mode { PICO_PACKED_MODE,
+ PICO_PLANAR_MODE };
+
+/* Bits in coefficients */
+#define PICO_COEFF_BITS 12
+
+/* Operation bits */
+#define PICO_MATRIX (0)
+#define PICO_USE_ACC (1 << 2)
+#define PICO_SINGLE_VECTOR (1 << 3)
+
+
+#define __str(x...) #x
+#define __xstr(x...) __str(x)
+
+#define PICO_PUT_W(pico_reg, x) \
+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
+#define PICO_GET_W(pico_reg) \
+ __builtin_mvcr_w(PICO_CPNO, pico_reg)
+
+#define PICO_MVCR_W(x, pico_reg) \
+ asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
+
+#define PICO_MVRC_W(pico_reg, x) \
+ asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
+
+#define PICO_PUT_D(pico_reg, x) \
+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
+#define PICO_GET_D(pico_reg) \
+ __builtin_mvcr_d(PICO_CPNO, pico_reg)
+
+#define PICO_MVCR_D(x, pico_reg) \
+ asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
+#define PICO_MVRC_D(pico_reg, x) \
+ asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
+
+#define PICO_STCM_W(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_STCM_D(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_LDCM_W(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_LDCM_D(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
+
+static inline void set_pico_config(struct pico_config_t *config){
+ PICO_LDCM_D(config,
+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
+}
+
+static inline void get_pico_config(struct pico_config_t *config){
+ PICO_STCM_D(config,
+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
+}
+
+static inline void dump_pico_config(void){
+ struct pico_config_t pico_config;
+ char *input_mode, *output_mode;
+ get_pico_config(&pico_config);
+
+ av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
+ av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
+ av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
+
+ switch ( pico_config.input_mode ){
+ case PICO_TRANSFORMATION_MODE:
+ input_mode = "Transformation Mode";
+ break;
+ case PICO_HOR_FILTER_MODE:
+ input_mode = "Horisontal Filter Mode";
+ break;
+ case PICO_VERT_FILTER_MODE:
+ input_mode = "Vertical Filter Mode";
+ break;
+ default:
+ input_mode = "Unknown Mode!!";
+ break;
+ }
+ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
+
+ switch ( pico_config.output_mode ){
+ case PICO_PLANAR_MODE:
+ output_mode = "Planar Mode";
+ break;
+ case PICO_PACKED_MODE:
+ output_mode = "Packed Mode";
+ break;
+ default:
+ output_mode = "Unknown Mode!!";
+ break;
+ }
+
+ av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
+}
+
+#endif
+
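To illustrate the interface this header exposes, here is a minimal hypothetical caller; the field values below are arbitrary placeholders, not settings the patch itself uses:

    #include "pico.h"

    static void pico_setup_example(void)
    {
        struct pico_config_t cfg;

        get_pico_config(&cfg);               /* stcm.d: dump the coprocessor regs */
        cfg.input_mode      = PICO_HOR_FILTER_MODE;
        cfg.output_mode     = PICO_PACKED_MODE;
        cfg.coeff_frac_bits = PICO_COEFF_BITS - 1;
        set_pico_config(&cfg);               /* ldcm.d: write the whole block back */

        PICO_PUT_W(PICO_INPIX0, 0x01020304); /* feed one packed input word */
    }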
diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
index e1ec934..f2095be 100644
--- a/libavcodec/bitstream.h
+++ b/libavcodec/bitstream.h
@@ -181,6 +181,7 @@ typedef struct RL_VLC_ELEM {
uint8_t run;
} RL_VLC_ELEM;
+
#ifndef ALT_BITSTREAM_WRITER
static inline void put_bits(PutBitContext *s, int n, unsigned int value)
{
@@ -799,6 +800,44 @@ void free_vlc(VLC *vlc);
* if the vlc code is invalid and max_depth>1 then the number of bits removed
* is undefined
*/
+
+#if defined(ARCH_AVR32)
+#define GET_VLC(code, name, gb, table, bits, max_depth)\
+{\
+ int n, index, nb_bits;\
+ union { VLC_TYPE vlc[2];\
+ uint32_t u32; } table_elem;\
+\
+ index= SHOW_UBITS(name, gb, bits);\
+ table_elem.u32 = (*((uint32_t*)(&table[index]))); \
+ code = table_elem.vlc[0];\
+ n = table_elem.vlc[1];\
+\
+ if(max_depth > 1 && n < 0 ){\
+ LAST_SKIP_BITS(name, gb, bits)\
+ UPDATE_CACHE(name, gb)\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
+ table_elem.u32 = (*((uint32_t*)(&table[index]))); \
+ code = table_elem.vlc[0];\
+ n = table_elem.vlc[1];\
+ if(max_depth > 2 && n < 0){\
+ LAST_SKIP_BITS(name, gb, nb_bits)\
+ UPDATE_CACHE(name, gb)\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
+ code = table[index][0];\
+ n = table[index][1];\
+ }\
+ }\
+ SKIP_BITS(name, gb, n)\
+}
+
+#else
#define GET_VLC(code, name, gb, table, bits, max_depth)\
{\
int n, index, nb_bits;\
@@ -807,7 +846,7 @@ void free_vlc(VLC *vlc);
code = table[index][0];\
n = table[index][1];\
\
- if(max_depth > 1 && n < 0){\
+ if(max_depth > 1 && n < 0 ){\
LAST_SKIP_BITS(name, gb, bits)\
UPDATE_CACHE(name, gb)\
\
@@ -829,7 +868,38 @@ void free_vlc(VLC *vlc);
}\
SKIP_BITS(name, gb, n)\
}
+#endif
+#if defined(ARCH_AVR32)
+#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
+{\
+ int n, index, nb_bits;\
+ union { RL_VLC_ELEM vlc;\
+ uint32_t u32; } table_elem;\
+\
+ index= SHOW_UBITS(name, gb, bits);\
+ table_elem.u32 = (*((uint32_t*)(&table[index]))); \
+ level = table_elem.vlc.level;\
+ n = table_elem.vlc.len;\
+\
+ if(max_depth > 1 && n < 0 ){\
+ SKIP_BITS(name, gb, bits)\
+ if(need_update){\
+ UPDATE_CACHE(name, gb)\
+ }\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + level;\
+ table_elem.u32 = (*((uint32_t*)(&table[index]))); \
+ level = table_elem.vlc.level;\
+ n = table_elem.vlc.len;\
+ }\
+ run= table_elem.vlc.run;\
+ SKIP_BITS(name, gb, n)\
+}
+
+#else
#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
{\
int n, index, nb_bits;\
@@ -838,7 +908,7 @@ void free_vlc(VLC *vlc);
level = table[index].level;\
n = table[index].len;\
\
- if(max_depth > 1 && n < 0){\
+ if(max_depth > 1 && n < 0 ){\
SKIP_BITS(name, gb, bits)\
if(need_update){\
UPDATE_CACHE(name, gb)\
@@ -853,7 +923,7 @@ void free_vlc(VLC *vlc);
run= table[index].run;\
SKIP_BITS(name, gb, n)\
}
-
+#endif
/**
* parses a vlc code, faster than get_vlc()
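The AVR32 GET_VLC/GET_RL_VLC variants above avoid two halfword loads per lookup by fetching both 16-bit fields of a table entry with a single aligned 32-bit load through a union. A standalone sketch of the same trick (assumes entries are 4-byte aligned, as the macros do; names are mine):

    #include <stdint.h>

    #define VLC_TYPE int16_t

    union vlc_word { VLC_TYPE vlc[2]; uint32_t u32; };

    static inline void vlc_load(const VLC_TYPE (*table)[2], int index,
                                int *code, int *n)
    {
        union vlc_word e;
        e.u32 = *(const uint32_t *)&table[index]; /* one 32-bit load */
        *code = e.vlc[0];
        *n    = e.vlc[1];
    }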
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index b33fb50..30418bf 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4644,6 +4644,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
if (HAVE_MMX) dsputil_init_mmx (c, avctx);
if (ARCH_ARM) dsputil_init_arm (c, avctx);
+ if (ARCH_AVR32) dsputil_init_avr32 (c, avctx);
if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
if (HAVE_VIS) dsputil_init_vis (c, avctx);
if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 285f9c9..133b918 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -2010,7 +2010,12 @@ static void free_tables(H264Context *h){
static void init_dequant8_coeff_table(H264Context *h){
int i,q,x;
+#ifdef ARCH_AVR32
+ const int transpose = 0;
+#else
const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
+#endif
+
h->dequant8_coeff[0] = h->dequant8_buffer[0];
h->dequant8_coeff[1] = h->dequant8_buffer[1];
@@ -2033,7 +2038,13 @@ static void init_dequant8_coeff_table(H264Context *h){
static void init_dequant4_coeff_table(H264Context *h){
int i,j,q,x;
+ // Yes this is ugly as hell....
+#ifdef ARCH_AVR32
+ const int transpose = 0;
+#else
const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
+#endif
+
for(i=0; i<6; i++ ){
h->dequant4_coeff[i] = h->dequant4_buffer[i];
for(j=0; j<i; j++){
@@ -3560,7 +3571,11 @@ static int init_poc(H264Context *h){
static void init_scan_tables(H264Context *h){
MpegEncContext * const s = &h->s;
int i;
+#ifdef ARCH_AVR32
+ if(1){
+#else
if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
+#endif
memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
}else{
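A note on the three h264.c hunks: they pin transpose to 0 and keep the C scan tables because the AVR32 idct routines consume coefficients in the natural, non-transposed order, i.e. they behave like ff_h264_idct_add_c in this respect. Conceptually (a sketch of the invariant, not patch code):

    /* What the ARCH_AVR32 branches assert: the dequant and scan tables
     * must stay in the layout the C reference idct expects. */
    #ifdef ARCH_AVR32
    #  define IDCT_USES_C_COEFF_LAYOUT 1
    #else
    #  define IDCT_USES_C_COEFF_LAYOUT \
          (h->s.dsp.h264_idct_add == ff_h264_idct_add_c)
    #endif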
--
1.5.6.3