[FFmpeg-cvslog] Merge commit 'e4a94d8b36c48d95a7d412c40d7b558422ff659c'
James Almer
git at videolan.org
Tue Mar 21 20:27:07 EET 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Mar 21 15:20:45 2017 -0300| [a8474df9447d6466c77d3ec8f414cda2662f057b] | committer: James Almer
Merge commit 'e4a94d8b36c48d95a7d412c40d7b558422ff659c'
* commit 'e4a94d8b36c48d95a7d412c40d7b558422ff659c':
h264chroma: Change type of stride parameters to ptrdiff_t
Merged-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a8474df9447d6466c77d3ec8f414cda2662f057b
---
libavcodec/aarch64/h264chroma_init_aarch64.c | 12 ++++++------
libavcodec/aarch64/h264cmc_neon.S | 7 ++-----
libavcodec/aarch64/rv40dsp_init_aarch64.c | 16 ++++++++--------
libavcodec/aarch64/vc1dsp_init_aarch64.c | 16 ++++++++--------
libavcodec/arm/h264chroma_init_arm.c | 18 ++++++++++++------
libavcodec/arm/h264cmc_neon.S | 4 ++--
libavcodec/arm/vc1dsp_init_neon.c | 16 ++++++++--------
libavcodec/h264chroma.h | 3 ++-
libavcodec/h264chroma_template.c | 21 +++++++++++++--------
libavcodec/mips/h264chroma_mips.h | 20 ++++++++++----------
libavcodec/mips/h264chroma_mmi.c | 8 ++++----
libavcodec/mips/h264chroma_msa.c | 12 ++++++------
libavcodec/ppc/h264chroma_template.c | 9 +++++++--
libavcodec/rv40dsp.c | 14 ++++++++++----
libavcodec/vc1dsp.c | 8 ++++----
libavcodec/x86/h264_chromamc.asm | 18 +-----------------
libavcodec/x86/h264_chromamc_10bit.asm | 15 ++++++---------
libavcodec/x86/h264chroma_init.c | 26 +++++++++++++-------------
libavcodec/x86/rv40dsp_init.c | 12 ++++++------
libavcodec/x86/vc1dsp_init.c | 10 +++++-----
20 files changed, 133 insertions(+), 132 deletions(-)
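
The change replaces the int stride arguments with ptrdiff_t across the h264chroma callbacks (and the rv40/vc1 code that reuses them), so strides are already pointer-width and the 64-bit assembly no longer needs the explicit sign-extensions (the sxtw / movsxd / movsxdifnidn instructions removed in the hunks below). A minimal standalone sketch of the idea, not FFmpeg code (copy_block2 is a made-up stand-in for the real mc functions):

/* Minimal sketch, not FFmpeg code: why ptrdiff_t for strides.
 * On LP64 targets int is 32-bit while pointers are 64-bit, so an int
 * stride has to be sign-extended before pointer arithmetic; a ptrdiff_t
 * stride is already pointer-width, which is why the assembly below can
 * drop its explicit sign-extensions. */
#include <stddef.h>
#include <stdint.h>

/* Standalone copy of the callback shape after this change. */
typedef void (*chroma_mc_fn)(uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h, int x, int y);

/* Hypothetical illustration only: copies a 2-pixel-wide column. */
static void copy_block2(uint8_t *dst, uint8_t *src,
                        ptrdiff_t stride, int h, int x, int y)
{
    (void)x; (void)y;
    for (int i = 0; i < h; i++) {
        dst[0] = src[0];
        dst[1] = src[1];
        dst += stride;   /* pointer-width step, no narrowing or re-extension */
        src += stride;
    }
}

int main(void)
{
    uint8_t src[4 * 16] = { 1, 2 }, dst[4 * 16] = { 0 };
    chroma_mc_fn mc = copy_block2;
    mc(dst, src, 16, 4, 0, 0);   /* stride passed as ptrdiff_t */
    return dst[0] == 1 ? 0 : 1;
}
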
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index 2af62be..fa6e0ea 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -28,18 +28,18 @@
#include "config.h"
-void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index ff97a29..8be7578 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -21,10 +21,9 @@
#include "libavutil/aarch64/asm.S"
-/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
- sxtw x2, w2
.ifc \type,avg
mov x8, x0
.endif
@@ -192,10 +191,9 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
endfunc
.endm
-/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
- sxtw x2, w2
.ifc \type,avg
mov x8, x0
.endif
@@ -359,7 +357,6 @@ endfunc
.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
- sxtw x2, w2
prfm pldl1strm, [x1]
prfm pldl1strm, [x1, x2]
orr w7, w4, w5
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index 764bc1e..142705d 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -25,15 +25,15 @@
#include "config.h"
-void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
av_cold void ff_rv40dsp_init_aarch64(RV34DSPContext *c)
{
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index e59e55e..13dfd74 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,14 +25,14 @@
#include "config.h"
-void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 13f7e0d..aae804b 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -26,13 +26,19 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264chroma.h"
-void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
-void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
{
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index fc48a6f..5a4159e 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -20,7 +20,7 @@
#include "libavutil/arm/asm.S"
-/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
push {r4-r7, lr}
@@ -195,7 +195,7 @@ T cmp r7, #0
endfunc
.endm
-/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
push {r4-r7, lr}
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index c340144..005d45c 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -70,14 +70,14 @@ DECL_PUT(3, 1)
DECL_PUT(3, 2)
DECL_PUT(3, 3)
-void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
#define FN_ASSIGN(X, Y) \
dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index e0f45ad..5c89fd1 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -19,9 +19,10 @@
#ifndef AVCODEC_H264CHROMA_H
#define AVCODEC_H264CHROMA_H
+#include <stddef.h>
#include <stdint.h>
-typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ptrdiff_t srcStride, int h, int x, int y);
typedef struct H264ChromaContext {
h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
diff --git a/libavcodec/h264chroma_template.c b/libavcodec/h264chroma_template.c
index 072b5e0..a3ca07b 100644
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@ -19,12 +19,14 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/avassert.h"
+#include <stddef.h>
+
+#include "libavutil/avassert.h"
#include "bit_depth_template.c"
#define H264_CHROMA_MC(OPNAME, OP)\
-static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y){\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -58,7 +60,8 @@ static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *
}\
}\
}\
-static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -79,7 +82,7 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *
}\
} else if (B + C) {\
const int E= B+C;\
- const int step= C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i=0; i<h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0]));\
OP(dst[1], (A*src[1] + E*src[step+1]));\
@@ -96,7 +99,8 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *
}\
}\
\
-static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -119,7 +123,7 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *
}\
} else if (B + C) {\
const int E= B+C;\
- const int step= C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i=0; i<h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0]));\
OP(dst[1], (A*src[1] + E*src[step+1]));\
@@ -140,7 +144,8 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *
}\
}\
\
-static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -167,7 +172,7 @@ static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst/*align 8*/, uint8_t *
}\
} else if (B + C) {\
const int E= B+C;\
- const int step= C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i=0; i<h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0]));\
OP(dst[1], (A*src[1] + E*src[step+1]));\
diff --git a/libavcodec/mips/h264chroma_mips.h b/libavcodec/mips/h264chroma_mips.h
index 6e6127d..996384d 100644
--- a/libavcodec/mips/h264chroma_mips.h
+++ b/libavcodec/mips/h264chroma_mips.h
@@ -22,26 +22,26 @@
#define AVCODEC_MIPS_H264CHROMA_MIPS_H
#include "libavcodec/h264dec.h"
-void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */
diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c
index 417b4a2..bafe0f9 100644
--- a/libavcodec/mips/h264chroma_mmi.c
+++ b/libavcodec/mips/h264chroma_mmi.c
@@ -26,7 +26,7 @@
#include "constants.h"
#include "libavutil/mips/mmiutils.h"
-void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
@@ -207,7 +207,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
}
}
-void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
@@ -396,7 +396,7 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
}
}
-void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
@@ -546,7 +546,7 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
}
}
-void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) *(8 - y);
diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 67d0bc1..940e12d 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1869,7 +1869,7 @@ static void avg_width8_msa(uint8_t *src, int32_t src_stride,
}
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
@@ -1886,7 +1886,7 @@ void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
}
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
int32_t cnt;
@@ -1910,7 +1910,7 @@ void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
}
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
int32_t cnt;
@@ -1934,7 +1934,7 @@ void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
}
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
@@ -1955,7 +1955,7 @@ void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
}
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
@@ -1975,7 +1975,7 @@ void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
}
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
int32_t cnt;
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index cb1e095..d9b2a61 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -111,7 +111,9 @@
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
- int stride, int h, int x, int y) {
+ ptrdiff_t stride, int h,
+ int x, int y)
+{
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
@@ -183,7 +185,10 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
/* this code assume that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
-static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
+static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int h,
+ int x, int y)
+{
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index 95ba0a9..5579bd9 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -292,7 +292,10 @@ static const int rv40_bias[4][4] = {
};
#define RV40_CHROMA_MC(OPNAME, OP)\
-static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\
+ uint8_t *src /*align 1*/,\
+ ptrdiff_t stride, int h, int x, int y)\
+{\
const int A = (8-x) * (8-y);\
const int B = ( x) * (8-y);\
const int C = (8-x) * ( y);\
@@ -313,7 +316,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}\
}else{\
const int E = B + C;\
- const int step = C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i = 0; i < h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0] + bias));\
OP(dst[1], (A*src[1] + E*src[step+1] + bias));\
@@ -325,7 +328,10 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}\
}\
\
-static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\
+ uint8_t *src/*align 1*/,\
+ ptrdiff_t stride, int h, int x, int y)\
+{\
const int A = (8-x) * (8-y);\
const int B = ( x) * (8-y);\
const int C = (8-x) * ( y);\
@@ -350,7 +356,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}\
}else{\
const int E = B + C;\
- const int step = C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i = 0; i < h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0] + bias));\
OP(dst[1], (A*src[1] + E*src[step+1] + bias));\
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index eaadebe..9239a4a 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -782,7 +782,7 @@ PUT_VC1_MSPEL(3, 3)
C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = (x) * (8 - y);
@@ -807,7 +807,7 @@ static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
}
static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = (x) * (8 - y);
@@ -830,7 +830,7 @@ static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
#define avg2(a, b) (((a) + (b) + 1) >> 1)
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = (x) * (8 - y);
@@ -856,7 +856,7 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = ( x) * (8 - y);
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index fa698e5..b5a78b5 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -105,11 +105,8 @@ SECTION .text
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
; uint8_t *src /* align 1 */,
-; int stride, int h, int mx, int my)
+; ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
@@ -291,9 +288,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
pxor m7, m7
movd m2, r4d ; x
movd m3, r5d ; y
@@ -376,10 +370,6 @@ cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
-
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
@@ -465,9 +455,6 @@ chroma_mc4_mmx_func avg, rv40
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
@@ -613,9 +600,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
mov r6, r4
shl r4d, 8
sub r4d, r6d
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index c358482..34bc419 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -57,12 +57,11 @@ SECTION .text
%endmacro
;-----------------------------------------------------------------------------
-; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h,
-; int mx, int my)
+; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
+; int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
cglobal %1_h264_chroma_mc8_10, 6,7,8
- movsxdifnidn r2, r2d
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
@@ -149,8 +148,8 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
%endmacro
;-----------------------------------------------------------------------------
-; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h,
-; int mx, int my)
+; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
+; int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
@@ -174,7 +173,6 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
- movsxdifnidn r2, r2d
movd m2, r4m ; x
movd m3, r5m ; y
mova m4, [pw_8]
@@ -200,12 +198,11 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7
%endmacro
;-----------------------------------------------------------------------------
-; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h,
-; int mx, int my)
+; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
+; int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
- movsxdifnidn r2, r2d
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index e08af27..36bf29d 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -25,38 +25,38 @@
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 218deb8..340173d 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -41,18 +41,18 @@ static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src,
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index e05ae06..79d22a2 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -83,15 +83,15 @@ DECLARE_FUNCTION(avg_, 16, _sse2)
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
======================================================================
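
Below the separator are the combined (diff --cc) hunks for the files that required manual merge resolution; the h264chroma.h hunk keeps FFmpeg's 4-entry function tables rather than Libav's 3-entry ones while taking the ptrdiff_t signature. For reference only, a condensed reconstruction of how that header reads after the merge (comments are mine, not from the source; the diff --cc hunk below is authoritative):

/* Condensed reconstruction of libavcodec/h264chroma.h after this merge. */
#include <stddef.h>   /* ptrdiff_t: the include this commit adds */
#include <stdint.h>

typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/,
                                    uint8_t *src /*align 1*/,
                                    ptrdiff_t srcStride,
                                    int h, int x, int y);

typedef struct H264ChromaContext {
    /* FFmpeg keeps 4 entries (mc1/mc2/mc4/mc8); the merged-in header
     * had 3, which is the conflict resolved in the hunk below. */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[4];
} H264ChromaContext;

void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
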
diff --cc libavcodec/arm/vc1dsp_init_neon.c
index c340144,08c07c4..005d45c
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@@ -37,52 -37,50 +37,52 @@@ void ff_vc1_inv_trans_4x4_dc_neon(uint8
void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int rnd);
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd) \
+{ \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+ dst += 8*stride; src += 8*stride; \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
- void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
- void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
- void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
- void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+#define FN_ASSIGN(X, Y) \
+ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+
av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
{
dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
diff --cc libavcodec/h264chroma.h
index e0f45ad,9fc2a0f..5c89fd1
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@@ -19,13 -19,14 +19,14 @@@
#ifndef AVCODEC_H264CHROMA_H
#define AVCODEC_H264CHROMA_H
+ #include <stddef.h>
#include <stdint.h>
- typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+ typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ptrdiff_t srcStride, int h, int x, int y);
typedef struct H264ChromaContext {
- h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
- h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+ h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
+ h264_chroma_mc_func avg_h264_chroma_pixels_tab[4];
} H264ChromaContext;
void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
diff --cc libavcodec/h264chroma_template.c
index 072b5e0,ed364dd..a3ca07b
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@@ -19,46 -19,14 +19,49 @@@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
- #include "libavutil/avassert.h"
-#include <assert.h>
+
+ #include <stddef.h>
+
++#include "libavutil/avassert.h"
#include "bit_depth_template.c"
#define H264_CHROMA_MC(OPNAME, OP)\
- static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
++static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y){\
+ pixel *dst = (pixel*)_dst;\
+ pixel *src = (pixel*)_src;\
+ const int A=(8-x)*(8-y);\
+ const int B=( x)*(8-y);\
+ const int C=(8-x)*( y);\
+ const int D=( x)*( y);\
+ int i;\
+ stride >>= sizeof(pixel)-1;\
+ \
+ av_assert2(x<8 && y<8 && x>=0 && y>=0);\
+\
+ if(D){\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ } else if (B + C) {\
+ const int E= B+C;\
+ const int step= C ? stride : 1;\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + E*src[step+0]));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ } else {\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0]));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ }\
+}\
- static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+ {\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
diff --cc libavcodec/mips/h264chroma_mips.h
index 6e6127d,0000000..996384d
mode 100644,000000..100644
--- a/libavcodec/mips/h264chroma_mips.h
+++ b/libavcodec/mips/h264chroma_mips.h
@@@ -1,47 -1,0 +1,47 @@@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264CHROMA_MIPS_H
+#define AVCODEC_MIPS_H264CHROMA_MIPS_H
+
+#include "libavcodec/h264dec.h"
- void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
+
- void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
- void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
- void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
- void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */
diff --cc libavcodec/mips/h264chroma_mmi.c
index 417b4a2,0000000..bafe0f9
mode 100644,000000..100644
--- a/libavcodec/mips/h264chroma_mmi.c
+++ b/libavcodec/mips/h264chroma_mmi.c
@@@ -1,704 -1,0 +1,704 @@@
+/*
+ * Loongson SIMD optimized h264chroma
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
+ * Zhang Shuangshuang <zhangshuangshuang at ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+#include "constants.h"
+#include "libavutil/mips/mmiutils.h"
+
- void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+ const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[10];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_ALL64;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[src], 0x01)
+ MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
+ "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
+ "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
+ "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp4] \n\t"
+
+ "1: \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
+
- void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+ const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[10];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_ALL64;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[src], 0x01)
+ MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
+ "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
+ "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
+ "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp4] \n\t"
+
+ "1: \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
+
- void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+ const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[8];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_LOW32;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[src], 0x01)
+ MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
+ "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp3] \n\t"
+
+ "1: \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
+
- void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[8];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_LOW32;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[src], 0x01)
+ MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
+ "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp3] \n\t"
+
+ "1: \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
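
[Editorial note, not part of the patch] The MMI kernels above all evaluate the
standard H.264 bilinear chroma interpolation: the A/B/C/D weights come from the
1/8-pel fractional offsets x and y, the sum is rounded by adding 32 (ff_pw_32)
and shifted right by 6, and the avg variants additionally average the result
with the bytes already present in dst (pavgb). A minimal scalar sketch of the
put case follows; the helper name is hypothetical and the semantics are assumed
to match the generic C code in libavcodec/h264chroma_template.c.

#include <stddef.h>
#include <stdint.h>

static void put_h264_chroma_mc4_ref(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int h, int x, int y)
{
    /* Bilinear weights derived from the 1/8-pel fractional position (x, y). */
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++)
            /* Weighted sum of the four neighbours, rounded and scaled back. */
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      32) >> 6;
        dst += stride;
        src += stride;
    }
}

The special cases in the code above (D == 0, then E = B + C) are just this
formula with one or more weights equal to zero, which lets the SIMD versions
drop loads and multiplies without changing the result.
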
diff --cc libavcodec/mips/h264chroma_msa.c
index 67d0bc1,0000000..940e12d
mode 100644,000000..100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@@ -1,2003 -1,0 +1,2003 @@@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil at imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264chroma_mips.h"
+
+static const uint8_t chroma_mask_arr[16 * 5] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ v16i8 src0, src1;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ out0 = __msa_copy_u_h(res, 0);
+ out1 = __msa_copy_u_h(res, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[64]);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+ src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+ res_r = __msa_dotp_u_h(src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[64]);
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+
+ ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+
+ res_r = __msa_dotp_u_h(src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ res_r = __msa_dotp_u_h(src4, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16i8 src0, src1;
+ v8u16 res_r;
+ v4i32 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3;
+ v8u16 res0_r, res1_r;
+ v4i32 res0, res1;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else {
+ avc_chroma_hz_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+ coeff1, height);
+ }
+}
+
+static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+
+ if (0 != (height % 4)) {
+ for (row = (height % 4); row--;) {
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+
+ res0 = __msa_dotp_u_h(src0, coeff_vec);
+ res0 <<= 3;
+ res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
+ res0 = __msa_sat_u_h(res0, 7);
+ res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+
+ ST8x1_UB(res0, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ v16i8 src0, src1, src2;
+ v16u8 tmp0, tmp1;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+
+ ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ out0 = __msa_copy_u_h(res, 0);
+ out1 = __msa_copy_u_h(res, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+}
+
+static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_vt_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_vt_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2;
+ v16u8 tmp0, tmp1;
+ v4i32 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res0_r, res1_r;
+ v4i32 res0, res1;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+ DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else {
+ avc_chroma_vt_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+ coeff1, height);
+ }
+}
+
+static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4, out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ src0, src1, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ uint16_t out0, out1;
+ v16u8 src0, src1, src2;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v8i16 res_vert;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ out0 = __msa_copy_u_h(res_vert, 0);
+ out1 = __msa_copy_u_h(res_vert, 1);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ } else if (4 == height) {
+ avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ } else if (8 == height) {
+ avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ }
+}
+
+static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 mask;
+ v4i32 res;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+ LD_UB3(src, src_stride, src0, src1, src2);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+ v4i32 res0, res1;
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3);
+ MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+ ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+ SRARI_H2_UH(res_vt0, res_vt1, 6);
+ SAT_UH2_UH(res_vt0, res_vt1, 7);
+ PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ } else {
+ avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1, coef_ver0,
+ coef_ver1, height);
+ }
+}
+
+static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4, out0, out1;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+ res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+ VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+ DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+ res_hz4);
+ MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+ coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+
+ res_vt0 += (res_hz0 * coeff_vt_vec1);
+ res_vt1 += (res_hz1 * coeff_vt_vec1);
+ res_vt2 += (res_hz2 * coeff_vt_vec1);
+ res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+ SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+ SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+ PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ dst += (4 * dst_stride);
+
+ res_hz0 = res_hz4;
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ uint32_t load0, load1;
+ v16i8 src0, src1;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16u8 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst_data = __msa_aver_u_b(res, dst_data);
+
+ out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+ out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 res_r;
+ v16i8 res, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[64]);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+ src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+ res_r = __msa_dotp_u_h(src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8u16 res0_r, res1_r;
+ v16u8 res0, res1, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_UB(&chroma_mask_arr[64]);
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+ ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+ DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1;
+ v16i8 src0, src1;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16i8 res, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+
+ ST4x2_UB(dst_data, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint32_t coeff0,
+ uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t load0, load1;
+ uint32_t row;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0 = { 0 };
+ v16u8 dst1 = { 0 };
+ v8u16 res0_r, res1_r;
+ v16u8 res0, res1, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_UB(&chroma_mask_arr[0]);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst0);
+
+ load0 = LW(dst + 2 * dst_stride);
+ load1 = LW(dst + 3 * dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst1);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+ ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else {
+ avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride,
+ dst, dst_stride,
+ coeff0, coeff1, height);
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ uint32_t load0, load1;
+ v16i8 src0, src1, src2, tmp0, tmp1, res;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+
+ ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+ res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+ out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+ out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1;
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+ v16u8 dst_data = { 0 };
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+
+ load0 = LW(dst + 2 * dst_stride);
+ load1 = LW(dst + 3 * dst_stride);
+
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1, load2, load3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+ v16u8 dst_data0 = { 0 };
+ v16u8 dst_data1 = { 0 };
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_SB4(src, src_stride, src5, src6, src7, src8);
+
+ LW4(dst, dst_stride, load0, load1, load2, load3);
+
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
+
+ LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
+
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+ tmp0, tmp1, tmp2, tmp3);
+
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1;
+ v16i8 src0, src1, src2, tmp0, tmp1;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16u8 res;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+ ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = __msa_aver_u_b(res, dst_data);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint32_t coeff0,
+ uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t load0, load1, row;
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 dst0 = { 0 };
+ v16u8 dst1 = { 0 };
+ v8u16 res0_r, res1_r;
+ v16u8 res0, res1;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst0);
+ load0 = LW(dst + 2 * dst_stride);
+ load1 = LW(dst + 3 * dst_stride);
+ INSERT_W2_UB(load0, load1, dst1);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+ DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else {
+ avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1, height);
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ src0, src1, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ uint16_t out0, out1;
+ v16u8 dst0, dst1;
+ v16u8 src0, src1, src2;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+ out0 = __msa_copy_u_h((v8i16) dst0, 0);
+ out1 = __msa_copy_u_h((v8i16) dst0, 1);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst4 = __msa_aver_u_b((v16u8) res, dst4);
+
+ ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ } else if (4 == height) {
+ avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ } else if (8 == height) {
+ avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2;
+ v16u8 dst0, dst1;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST4x2_UB(dst0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+ v16u8 res0, res1;
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3);
+ MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+ ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+ SRARI_H2_UH(res_vt0, res_vt1, 6);
+ SAT_UH2_UH(res_vt0, res_vt1, 7);
+ PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+ dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+ dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+
+ AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+ ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ } else {
+ avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1, height);
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4, out0, out1;
+ v8u16 res_hz0, res_hz1, res_hz2;
+ v8u16 res_hz3, res_hz4;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+ res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+ VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+ DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+ res_hz4);
+ MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+ coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+
+ res_vt0 += (res_hz0 * coeff_vt_vec1);
+ res_vt1 += (res_hz1 * coeff_vt_vec1);
+ res_vt2 += (res_hz2 * coeff_vt_vec1);
+ res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+ SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+ SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+
+ PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ res_hz0 = res_hz4;
+ }
+}
+
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+ out4 = __msa_copy_u_d((v2i64) src4, 0);
+ out5 = __msa_copy_u_d((v2i64) src5, 0);
+ out6 = __msa_copy_u_d((v2i64) src6, 0);
+ out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+ out4 = __msa_copy_u_d((v2i64) src4, 0);
+ out5 = __msa_copy_u_d((v2i64) src5, 0);
+ out6 = __msa_copy_u_d((v2i64) src6, 0);
+ out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ out0 = __msa_copy_u_w((v4i32) dst0, 0);
+ out1 = __msa_copy_u_w((v4i32) dst1, 0);
+ out2 = __msa_copy_u_w((v4i32) dst2, 0);
+ out3 = __msa_copy_u_w((v4i32) dst3, 0);
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LD_UB2(dst, dst_stride, dst0, dst1);
+
+ AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+ out0 = __msa_copy_u_w((v4i32) dst0, 0);
+ out1 = __msa_copy_u_w((v4i32) dst1, 0);
+ SW(out0, dst);
+ dst += dst_stride;
+ SW(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ out0 = __msa_copy_u_d((v2i64) dst0, 0);
+ out1 = __msa_copy_u_d((v2i64) dst1, 0);
+ out2 = __msa_copy_u_d((v2i64) dst2, 0);
+ out3 = __msa_copy_u_d((v2i64) dst3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_8w_msa(src, stride, dst,
+ stride, x, (8 - x), y, (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_8w_msa(src, stride, dst, stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_8w_msa(src, stride, dst, stride, y, (8 - y), height);
+ } else {
+ copy_width8_msa(src, stride, dst, stride, height);
+ }
+}
+
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ int32_t cnt;
+
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_4w_msa(src, stride, dst,
+ stride, x, (8 - x), y, (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_4w_msa(src, stride, dst, stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_4w_msa(src, stride, dst, stride, y, (8 - y), height);
+ } else {
+ for (cnt = height; cnt--;) {
+ *((uint32_t *) dst) = *((uint32_t *) src);
+
+ src += stride;
+ dst += stride;
+ }
+ }
+}
+
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ int32_t cnt;
+
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_2w_msa(src, stride, dst,
+ stride, x, (8 - x), y, (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_2w_msa(src, stride, dst, stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_2w_msa(src, stride, dst, stride, y, (8 - y), height);
+ } else {
+ for (cnt = height; cnt--;) {
+ *((uint16_t *) dst) = *((uint16_t *) src);
+
+ src += stride;
+ dst += stride;
+ }
+ }
+}
+
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+
+ if (x && y) {
+ avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
+ stride, x, (8 - x), y,
+ (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst,
+ stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst,
+ stride, y, (8 - y), height);
+ } else {
+ avg_width8_msa(src, stride, dst, stride, height);
+ }
+}
+
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
+ stride, x, (8 - x), y,
+ (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst,
+ stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst,
+ stride, y, (8 - y), height);
+ } else {
+ avg_width4_msa(src, stride, dst, stride, height);
+ }
+}
+
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ int32_t cnt;
+
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
+ stride, x, (8 - x), y,
+ (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst,
+ stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst,
+ stride, y, (8 - y), height);
+ } else {
+ for (cnt = height; cnt--;) {
+ dst[0] = (dst[0] + src[0] + 1) >> 1;
+ dst[1] = (dst[1] + src[1] + 1) >> 1;
+
+ src += stride;
+ dst += stride;
+ }
+ }
+}
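
For orientation (not part of the commit): every MSA "put"/"avg" kernel above vectorizes the same bilinear H.264 chroma interpolation, with weights (8-x)(8-y), x(8-y), (8-x)y and xy summing to 64. A minimal scalar sketch of that arithmetic follows; the function name, the width parameter and the absence of the x==0/y==0 fast paths are illustrative assumptions, not code from this merge.

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for a w x h chroma block at fractional offset (x, y);
 * the SIMD code above computes the same sums eight (or four, or two)
 * pixels at a time, and the "avg" variants additionally round-average
 * the result against the existing dst pixels. */
static void put_chroma_bilinear(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride, int w, int h,
                                int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      32) >> 6;
        src += stride;
        dst += stride;
    }
}
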
diff --cc libavcodec/ppc/h264chroma_template.c
index cb1e095,daa7652..d9b2a61
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@@ -72,46 -70,11 +72,48 @@@
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
+#if HAVE_BIGENDIAN
+#define GET_VSRC1(vs0, off, b, perm0, s){ \
+ vec_u8 vsrcCuc, vsrcDuc; \
+ vsrcCuc = vec_ld(off, s); \
+ if (loadSecond){ \
+ vsrcDuc = vec_ld(off + b, s); \
+ } else \
+ vsrcDuc = vsrcCuc; \
+ \
+ vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+ vec_u8 vsrcCuc, vsrcDuc; \
+ vsrcCuc = vec_ld(off, s); \
+ if (loadSecond){ \
+ vsrcDuc = vec_ld(off + b, s); \
+ } else \
+ vsrcDuc = vsrcCuc; \
+ \
+ vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+ if (reallyBadAlign){ \
+ vs1 = vsrcDuc; \
+ } else \
+ vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1); \
+ }
+
+#else
+
+#define GET_VSRC1(vs0, off, b, perm0, s){ \
+ vs0 = vec_vsx_ld(off, s); \
+ }
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+ vs0 = vec_vsx_ld(off, s); \
+ vs1 = vec_vsx_ld(off + 1, s); \
+ }
+#endif /* HAVE_BIGENDIAN */
+
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
- int stride, int h, int x, int y) {
+ ptrdiff_t stride, int h,
+ int x, int y)
+ {
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
diff --cc libavcodec/x86/rv40dsp_init.c
index 218deb8,7bf3ecd..340173d
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@@ -32,27 -32,20 +32,27 @@@
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
diff --cc libavcodec/x86/vc1dsp_init.c
index e05ae06,8982ff9..79d22a2
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@@ -83,23 -71,15 +83,23 @@@ DECLARE_FUNCTION(avg_, 16, _sse2
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
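
The functional content of the merge is the prototype change repeated throughout this diff: chroma motion-compensation strides move from int to ptrdiff_t, the type of a pointer difference. A short illustrative sketch of why that type fits (the typedef and helper names below are assumptions for the example, not identifiers taken from h264chroma.h):

#include <stddef.h>
#include <stdint.h>

/* Strides are combined with pointers and may legitimately be negative
 * (e.g. walking a picture bottom-up), so the pointer-difference type
 * keeps the arithmetic at full pointer width without implicit
 * conversions from int. */
typedef void (*chroma_mc_fn)(uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h, int x, int y);

static uint8_t *row_ptr(uint8_t *base, ptrdiff_t stride, int row)
{
    /* row is converted to ptrdiff_t, so row * stride is evaluated at
     * pointer width rather than in an int-sized intermediate. */
    return base + row * stride;
}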