[FFmpeg-cvslog] Merge commit '2ec9fa5ec60dcd10e1cb10d8b4e4437e634ea428'

James Almer git at videolan.org
Tue Mar 21 20:07:38 EET 2017


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Mar 21 14:29:52 2017 -0300| [5a49097b42cbc3eab888d15a91eeaf5520b5c381] | committer: James Almer

Merge commit '2ec9fa5ec60dcd10e1cb10d8b4e4437e634ea428'

* commit '2ec9fa5ec60dcd10e1cb10d8b4e4437e634ea428':
  idct: Change type of array stride parameters to ptrdiff_t

Merged-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5a49097b42cbc3eab888d15a91eeaf5520b5c381
---

 doc/optimization.txt                 |  2 +-
 libavcodec/aarch64/idct.h            |  4 +--
 libavcodec/alpha/idctdsp_alpha.h     |  4 +--
 libavcodec/alpha/simple_idct_alpha.c |  4 +--
 libavcodec/arm/idct.h                | 13 +++++----
 libavcodec/arm/idctdsp_init_arm.c    | 13 ++++++---
 libavcodec/arm/simple_idct_armv6.S   |  4 +--
 libavcodec/arm/simple_idct_neon.S    |  4 +--
 libavcodec/arm/vc1dsp_init_neon.c    | 16 +++++------
 libavcodec/arm/vc1dsp_neon.S         |  6 ++--
 libavcodec/dct.h                     |  5 ++--
 libavcodec/dv.h                      |  2 +-
 libavcodec/dvdec.c                   |  2 +-
 libavcodec/faanidct.c                | 10 +++++--
 libavcodec/faanidct.h                |  5 ++--
 libavcodec/idctdsp.c                 | 12 ++++----
 libavcodec/idctdsp.h                 |  4 +--
 libavcodec/jrevdct.c                 |  4 +--
 libavcodec/mips/idctdsp_mips.h       | 12 ++++----
 libavcodec/mips/simple_idct_msa.c    |  4 +--
 libavcodec/mips/vc1dsp_mips.h        | 14 +++++-----
 libavcodec/mips/vc1dsp_mmi.c         | 14 +++++-----
 libavcodec/ppc/idctdsp.c             |  4 +--
 libavcodec/ppc/vc1dsp_altivec.c      |  3 +-
 libavcodec/simple_idct.c             | 12 ++++----
 libavcodec/simple_idct.h             | 21 +++++++-------
 libavcodec/simple_idct_template.c    |  8 +++---
 libavcodec/tests/dct.c               |  5 ++--
 libavcodec/vc1dsp.c                  | 54 ++++++++++++++++++------------------
 libavcodec/vc1dsp.h                  | 14 +++++-----
 libavcodec/wmv2dsp.c                 |  4 +--
 libavcodec/wmv2dsp.h                 |  4 +--
 libavcodec/x86/idctdsp.h             |  2 ++
 libavcodec/x86/simple_idct.c         |  4 +--
 libavcodec/x86/simple_idct.h         | 13 +++++----
 libavcodec/x86/vc1dsp_init.c         |  8 +++---
 libavcodec/x86/vc1dsp_mc.asm         |  2 +-
 libavcodec/x86/xvididct.h            | 13 +++++----
 libavcodec/x86/xvididct_init.c       |  8 +++---
 libavcodec/xvididct.c                |  4 +--
 40 files changed, 180 insertions(+), 161 deletions(-)
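
For reference, a minimal, self-contained sketch of the prototype change this
merge applies throughout (illustrative only: the idct1_put helper and the
local clip_uint8 stand-in for av_clip_uint8 are made up for the example,
loosely mirroring the trivial ff_jref_idct1_put case in idctdsp.c):

    #include <stddef.h>   /* ptrdiff_t, which the new prototypes rely on */
    #include <stdint.h>
    #include <stdio.h>

    static uint8_t clip_uint8(int v)            /* stand-in for av_clip_uint8 */
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Before: void idct1_put(uint8_t *dest, int line_size, int16_t *block);
     * After:  the stride becomes ptrdiff_t, the natural type for pointer
     *         arithmetic such as "dest += line_size". */
    static void idct1_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
    {
        (void)line_size;                        /* unused for a 1x1 block */
        dest[0] = clip_uint8((block[0] + 4) >> 3);
    }

    int main(void)
    {
        int16_t block[64] = { 800 };            /* only the DC coefficient set */
        uint8_t dest[8]   = { 0 };
        idct1_put(dest, (ptrdiff_t)8, block);   /* stride passed as ptrdiff_t */
        printf("dest[0] = %d\n", dest[0]);      /* prints 100 */
        return 0;
    }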

diff --git a/doc/optimization.txt b/doc/optimization.txt
index b4b0a06..c39e1e3 100644
--- a/doc/optimization.txt
+++ b/doc/optimization.txt
@@ -142,7 +142,7 @@ Alignment:
 Some instructions on some architectures have strict alignment restrictions,
 for example most SSE/SSE2 instructions on x86.
 The minimum guaranteed alignment is written in the .h files, for example:
-    void (*put_pixels_clamped)(const int16_t *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
+    void (*put_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, ptrdiff_t stride);
 
 
 General Tips:
diff --git a/libavcodec/aarch64/idct.h b/libavcodec/aarch64/idct.h
index 05699c2..5c49046 100644
--- a/libavcodec/aarch64/idct.h
+++ b/libavcodec/aarch64/idct.h
@@ -22,7 +22,7 @@
 #include <stdint.h>
 
 void ff_simple_idct_neon(int16_t *data);
-void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
 
 #endif /* AVCODEC_AARCH64_IDCT_H */
diff --git a/libavcodec/alpha/idctdsp_alpha.h b/libavcodec/alpha/idctdsp_alpha.h
index bf98495..8cc969d 100644
--- a/libavcodec/alpha/idctdsp_alpha.h
+++ b/libavcodec/alpha/idctdsp_alpha.h
@@ -28,7 +28,7 @@ extern void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
                                         ptrdiff_t line_size);
 
 void ff_simple_idct_axp(int16_t *block);
-void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_put_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 #endif /* AVCODEC_ALPHA_IDCTDSP_ALPHA_H */
diff --git a/libavcodec/alpha/simple_idct_alpha.c b/libavcodec/alpha/simple_idct_alpha.c
index 04be0ce..6e377ef 100644
--- a/libavcodec/alpha/simple_idct_alpha.c
+++ b/libavcodec/alpha/simple_idct_alpha.c
@@ -290,13 +290,13 @@ void ff_simple_idct_axp(int16_t *block)
     }
 }
 
-void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct_put_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_simple_idct_axp(block);
     put_pixels_clamped_axp_p(block, dest, line_size);
 }
 
-void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct_add_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_simple_idct_axp(block);
     add_pixels_clamped_axp_p(block, dest, line_size);
diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h
index 39cef3a..6c79a69 100644
--- a/libavcodec/arm/idct.h
+++ b/libavcodec/arm/idct.h
@@ -19,6 +19,7 @@
 #ifndef AVCODEC_ARM_IDCT_H
 #define AVCODEC_ARM_IDCT_H
 
+#include <stddef.h>
 #include <stdint.h>
 
 void ff_j_rev_dct_arm(int16_t *data);
@@ -26,15 +27,15 @@ void ff_j_rev_dct_arm(int16_t *data);
 void ff_simple_idct_arm(int16_t *data);
 
 void ff_simple_idct_armv5te(int16_t *data);
-void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_put_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
 
 void ff_simple_idct_armv6(int16_t *data);
-void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
 
 void ff_simple_idct_neon(int16_t *data);
-void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
 
 #endif /* AVCODEC_ARM_IDCT_H */
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
index 0068e3f..43782b2 100644
--- a/libavcodec/arm/idctdsp_init_arm.c
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -19,6 +19,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
@@ -34,25 +35,29 @@ void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
 
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
  * converted */
-static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
+static void j_rev_dct_arm_put(uint8_t *dest, ptrdiff_t line_size,
+                              int16_t *block)
 {
     ff_j_rev_dct_arm(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
 
-static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
+static void j_rev_dct_arm_add(uint8_t *dest, ptrdiff_t line_size,
+                              int16_t *block)
 {
     ff_j_rev_dct_arm(block);
     ff_add_pixels_clamped(block, dest, line_size);
 }
 
-static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
+static void simple_idct_arm_put(uint8_t *dest, ptrdiff_t line_size,
+                                int16_t *block)
 {
     ff_simple_idct_arm(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
 
-static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
+static void simple_idct_arm_add(uint8_t *dest, ptrdiff_t line_size,
+                                int16_t *block)
 {
     ff_simple_idct_arm(block);
     ff_add_pixels_clamped(block, dest, line_size);
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index 79cf5d4..f95c20d 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -390,7 +390,7 @@ function ff_simple_idct_armv6, export=1
         pop    {r4-r11, pc}
 endfunc
 
-/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */
+/* ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
 function ff_simple_idct_add_armv6, export=1
         push   {r0, r1, r4-r11, lr}
         sub    sp, sp, #128
@@ -407,7 +407,7 @@ function ff_simple_idct_add_armv6, export=1
         pop    {r4-r11, pc}
 endfunc
 
-/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */
+/* ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
 function ff_simple_idct_put_armv6, export=1
         push   {r0, r1, r4-r11, lr}
         sub    sp, sp, #128
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index c3e573c..726d4cb 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -261,7 +261,7 @@ endconst
         pop             {r4-r7, pc}
         .endm
 
-/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */
+/* void ff_simple_idct_put_neon(uint8_t *dst, ptrdiff_t line_size, int16_t *data); */
 function ff_simple_idct_put_neon, export=1
         idct_start      r2
 
@@ -316,7 +316,7 @@ function idct_col4_add8_neon
         bx              lr
 endfunc
 
-/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */
+/* void ff_simple_idct_add_neon(uint8_t *dst, ptrdiff_t line_size, int16_t *data); */
 function ff_simple_idct_add_neon, export=1
         idct_start      r2
 
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index bb873e6..c340144 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -25,14 +25,14 @@
 #include "config.h"
 
 void ff_vc1_inv_trans_8x8_neon(int16_t *block);
-void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, int linesize, int16_t *block);
-
-void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, int linesize, int16_t *block);
+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int rnd);
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index c4f4db9..611cbf2 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -426,7 +426,7 @@ function ff_vc1_inv_trans_8x8_neon, export=1
         bx              lr
 endfunc
 
-@ (uint8_t *dest [r0], int linesize [r1], int16_t *block [r2])
+@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
 function ff_vc1_inv_trans_8x4_neon, export=1
         vld1.64         {q0-q1}, [r2,:128]!     @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
         vld1.64         {q2-q3}, [r2,:128]
@@ -511,7 +511,7 @@ function ff_vc1_inv_trans_8x4_neon, export=1
         bx              lr
 endfunc
 
-@ (uint8_t *dest [r0], int linesize [r1], int16_t *block [r2])
+@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
 function ff_vc1_inv_trans_4x8_neon, export=1
         mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
         vld4.16         {d0[],  d2[],  d4[],  d6[]},  [r2,:64], r12     @ read each column into a q register
@@ -593,7 +593,7 @@ endfunc
         vshr.s16        q1,  q1,  #\rshift      @ dst[3,1] >>= rshift
 .endm
 
-@ (uint8_t *dest [r0], int linesize [r1], int16_t *block [r2])
+@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
 function ff_vc1_inv_trans_4x4_neon, export=1
         mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
         vld4.16         {d0[],  d1[],  d2[],  d3[]},  [r2,:64], r12     @ read each column into a register
diff --git a/libavcodec/dct.h b/libavcodec/dct.h
index 05297ba..0a03e25 100644
--- a/libavcodec/dct.h
+++ b/libavcodec/dct.h
@@ -24,6 +24,7 @@
 #if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_DCT_H
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include "rdft.h"
@@ -62,7 +63,7 @@ void ff_j_rev_dct(int16_t *data);
 void ff_j_rev_dct4(int16_t *data);
 void ff_j_rev_dct2(int16_t *data);
 void ff_j_rev_dct1(int16_t *data);
-void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block);
-void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block);
+void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 #endif /* AVCODEC_DCT_H */
diff --git a/libavcodec/dv.h b/libavcodec/dv.h
index d5482d1..6350a16 100644
--- a/libavcodec/dv.h
+++ b/libavcodec/dv.h
@@ -47,7 +47,7 @@ typedef struct DVVideoContext {
 
     void (*get_pixels)(int16_t *block, const uint8_t *pixels, ptrdiff_t linesize);
     void (*fdct[2])(int16_t *block);
-    void (*idct_put[2])(uint8_t *dest, int line_size, int16_t *block);
+    void (*idct_put[2])(uint8_t *dest, ptrdiff_t stride, int16_t *block);
     me_cmp_func ildct_cmp;
     DVwork_chunk work_chunks[4 * 12 * 27];
     uint32_t idct_factor[2 * 4 * 16 * 64];
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 0b4c1bc..d71a660 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -54,7 +54,7 @@ typedef struct BlockInfo {
     const uint32_t *factor_table;
     const uint8_t *scan_table;
     uint8_t pos; /* position in block */
-    void (*idct_put)(uint8_t *dest, int line_size, int16_t *block);
+    void (*idct_put)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
     uint8_t partial_bit_count;
     uint32_t partial_bit_buffer;
     int shift_offset;
diff --git a/libavcodec/faanidct.c b/libavcodec/faanidct.c
index 2aab726..3921f82 100644
--- a/libavcodec/faanidct.c
+++ b/libavcodec/faanidct.c
@@ -47,7 +47,9 @@ B6*B0/8, B6*B1/8, B6*B2/8, B6*B3/8, B6*B4/8, B6*B5/8, B6*B6/8, B6*B7/8,
 B7*B0/8, B7*B1/8, B7*B2/8, B7*B3/8, B7*B4/8, B7*B5/8, B7*B6/8, B7*B7/8,
 };
 
-static inline void p8idct(int16_t data[64], FLOAT temp[64], uint8_t *dest, int stride, int x, int y, int type){
+static inline void p8idct(int16_t data[64], FLOAT temp[64], uint8_t *dest,
+                          ptrdiff_t stride, int x, int y, int type)
+{
     int i;
     FLOAT s04, d04, s17, d17, s26, d26, s53, d53;
     FLOAT os07, os16, os25, os34;
@@ -135,7 +137,8 @@ void ff_faanidct(int16_t block[64]){
     p8idct(block, temp, NULL, 0, 8, 1, 1);
 }
 
-void ff_faanidct_add(uint8_t *dest, int line_size, int16_t block[64]){
+void ff_faanidct_add(uint8_t *dest, ptrdiff_t line_size, int16_t block[64])
+{
     FLOAT temp[64];
     int i;
 
@@ -148,7 +151,8 @@ void ff_faanidct_add(uint8_t *dest, int line_size, int16_t block[64]){
     p8idct(NULL , temp, dest, line_size, 8, 1, 2);
 }
 
-void ff_faanidct_put(uint8_t *dest, int line_size, int16_t block[64]){
+void ff_faanidct_put(uint8_t *dest, ptrdiff_t line_size, int16_t block[64])
+{
     FLOAT temp[64];
     int i;
 
diff --git a/libavcodec/faanidct.h b/libavcodec/faanidct.h
index 4cd2c78..6f4da67 100644
--- a/libavcodec/faanidct.h
+++ b/libavcodec/faanidct.h
@@ -22,10 +22,11 @@
 #ifndef AVCODEC_FAANIDCT_H
 #define AVCODEC_FAANIDCT_H
 
+#include <stddef.h>
 #include <stdint.h>
 
 void ff_faanidct(int16_t block[64]);
-void ff_faanidct_add(uint8_t *dest, int line_size, int16_t block[64]);
-void ff_faanidct_put(uint8_t *dest, int line_size, int16_t block[64]);
+void ff_faanidct_add(uint8_t *dest, ptrdiff_t line_size, int16_t block[64]);
+void ff_faanidct_put(uint8_t *dest, ptrdiff_t line_size, int16_t block[64]);
 
 #endif /* AVCODEC_FAANIDCT_H */
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index 37f4640..84dd645 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -207,33 +207,33 @@ static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pix
     }
 }
 
-static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
+static void ff_jref_idct4_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct4 (block);
     put_pixels_clamped4_c(block, dest, line_size);
 }
-static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
+static void ff_jref_idct4_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct4 (block);
     add_pixels_clamped4_c(block, dest, line_size);
 }
 
-static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
+static void ff_jref_idct2_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct2 (block);
     put_pixels_clamped2_c(block, dest, line_size);
 }
-static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
+static void ff_jref_idct2_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct2 (block);
     add_pixels_clamped2_c(block, dest, line_size);
 }
 
-static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
+static void ff_jref_idct1_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     dest[0] = av_clip_uint8((block[0] + 4)>>3);
 }
-static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
+static void ff_jref_idct1_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
 }
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
index e449be3..abfa587 100644
--- a/libavcodec/idctdsp.h
+++ b/libavcodec/idctdsp.h
@@ -68,14 +68,14 @@ typedef struct IDCTDSPContext {
      * @param line_size size in bytes of a horizontal line of dest
      */
     void (*idct_put)(uint8_t *dest /* align 8 */,
-                     int line_size, int16_t *block /* align 16 */);
+                     ptrdiff_t line_size, int16_t *block /* align 16 */);
 
     /**
      * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
      * @param line_size size in bytes of a horizontal line of dest
      */
     void (*idct_add)(uint8_t *dest /* align 8 */,
-                     int line_size, int16_t *block /* align 16 */);
+                     ptrdiff_t line_size, int16_t *block /* align 16 */);
 
     /**
      * IDCT input permutation.
diff --git a/libavcodec/jrevdct.c b/libavcodec/jrevdct.c
index 55a7392..89dd9f2 100644
--- a/libavcodec/jrevdct.c
+++ b/libavcodec/jrevdct.c
@@ -1156,13 +1156,13 @@ void ff_j_rev_dct1(DCTBLOCK data){
 #undef FIX
 #undef CONST_BITS
 
-void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
+void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
 
-void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
+void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
     ff_add_pixels_clamped(block, dest, line_size);
diff --git a/libavcodec/mips/idctdsp_mips.h b/libavcodec/mips/idctdsp_mips.h
index 19267e6..7ca7c1c 100644
--- a/libavcodec/mips/idctdsp_mips.h
+++ b/libavcodec/mips/idctdsp_mips.h
@@ -34,11 +34,11 @@ void ff_add_pixels_clamped_msa(const int16_t *block,
                                uint8_t *av_restrict pixels,
                                ptrdiff_t line_size);
 void ff_j_rev_dct_msa(int16_t *data);
-void ff_jref_idct_put_msa(uint8_t *dest, int32_t stride, int16_t *block);
-void ff_jref_idct_add_msa(uint8_t *dest, int32_t stride, int16_t *block);
+void ff_jref_idct_put_msa(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_jref_idct_add_msa(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_simple_idct_msa(int16_t *block);
-void ff_simple_idct_put_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
-void ff_simple_idct_add_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
+void ff_simple_idct_put_msa(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+void ff_simple_idct_add_msa(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
 
 void ff_put_pixels_clamped_mmi(const int16_t *block,
         uint8_t *av_restrict pixels, ptrdiff_t line_size);
@@ -47,7 +47,7 @@ void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
 void ff_add_pixels_clamped_mmi(const int16_t *block,
         uint8_t *av_restrict pixels, ptrdiff_t line_size);
 void ff_simple_idct_mmi(int16_t *block);
-void ff_simple_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
-void ff_simple_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+void ff_simple_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 #endif  // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c
index bd8b310..8a72359 100644
--- a/libavcodec/mips/simple_idct_msa.c
+++ b/libavcodec/mips/simple_idct_msa.c
@@ -562,12 +562,12 @@ void ff_simple_idct_msa(int16_t *block)
     simple_idct_msa(block);
 }
 
-void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
+void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
 {
     simple_idct_put_msa(dst, dst_stride, block);
 }
 
-void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
+void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
 {
     simple_idct_add_msa(dst, dst_stride, block);
 }
diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h
index 2cdd900..b9b07e1 100644
--- a/libavcodec/mips/vc1dsp_mips.h
+++ b/libavcodec/mips/vc1dsp_mips.h
@@ -157,14 +157,14 @@ void ff_avg_vc1_mspel_mc33_16_mmi(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd);
 
 void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]);
-void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, int linesize, int16_t *block);
+void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
 
-void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
-void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
 
 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride);
 void ff_vc1_h_overlap_mmi(uint8_t *src, int stride);
diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c
index dfae2d9..01e7f9f 100644
--- a/libavcodec/mips/vc1dsp_mmi.c
+++ b/libavcodec/mips/vc1dsp_mmi.c
@@ -141,7 +141,7 @@
 
 
 /* Do inverse transform on 8x8 block */
-void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int dc = block[0];
     double ftmp[9];
@@ -368,7 +368,7 @@ void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
 #endif
 
 /* Do inverse transform on 8x4 part of block */
-void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int dc = block[0];
     double ftmp[9];
@@ -425,7 +425,7 @@ void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
 }
 
 #if _MIPS_SIM != _ABIO32
-void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int16_t *src = block;
     int16_t *dst = block;
@@ -599,7 +599,7 @@ void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, int linesize, int16_t *block)
 #endif
 
 /* Do inverse transform on 4x8 parts of block */
-void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int dc = block[0];
     double ftmp[9];
@@ -672,7 +672,7 @@ void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
 }
 
 #if _MIPS_SIM != _ABIO32
-void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int16_t *src = block;
     int16_t *dst = block;
@@ -838,7 +838,7 @@ void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, int linesize, int16_t *block)
 #endif
 
 /* Do inverse transform on 4x4 part of block */
-void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int dc = block[0];
     double ftmp[5];
@@ -886,7 +886,7 @@ void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
     );
 }
 
-void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, int linesize, int16_t *block)
+void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     int16_t *src = block;
     int16_t *dst = block;
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
index 80e71fd..f1b4247 100644
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@ -169,7 +169,7 @@ static void idct_altivec(int16_t *blk)
     block[7] = vx7;
 }
 
-static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
+static void idct_put_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
 {
     vec_s16 *block = (vec_s16 *) blk;
     vec_u8 tmp;
@@ -198,7 +198,7 @@ static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
     COPY(dest, vx7);
 }
 
-static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
+static void idct_add_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
 {
     vec_s16 *block = (vec_s16 *) blk;
     vec_u8 tmp;
diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index 35bb280..83d537f 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -229,7 +229,8 @@ static void vc1_inv_trans_8x8_altivec(int16_t block[64])
 
 /** Do inverse transform on 8x4 part of block
 */
-static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block)
+static void vc1_inv_trans_8x4_altivec(uint8_t *dest, ptrdiff_t stride,
+                                      int16_t *block)
 {
     vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
     vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
diff --git a/libavcodec/simple_idct.c b/libavcodec/simple_idct.c
index 65b2911..1d05b2f 100644
--- a/libavcodec/simple_idct.c
+++ b/libavcodec/simple_idct.c
@@ -58,7 +58,7 @@
    and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
 #define C_SHIFT (4+1+12)
 
-static inline void idct4col_put(uint8_t *dest, int line_size, const int16_t *col)
+static inline void idct4col_put(uint8_t *dest, ptrdiff_t line_size, const int16_t *col)
 {
     int c0, c1, c2, c3, a0, a1, a2, a3;
 
@@ -94,7 +94,7 @@ static inline void idct4col_put(uint8_t *dest, int line_size, const int16_t *col
 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to
    compensate the extra butterfly stage - I don't have the full DV
    specification */
-void ff_simple_idct248_put(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     int i;
     int16_t *ptr;
@@ -137,7 +137,7 @@ void ff_simple_idct248_put(uint8_t *dest, int line_size, int16_t *block)
 #define C2 C_FIX(0.2705980501)
 #define C3 C_FIX(0.5)
 #define C_SHIFT (4+1+12)
-static inline void idct4col_add(uint8_t *dest, int line_size, const int16_t *col)
+static inline void idct4col_add(uint8_t *dest, ptrdiff_t line_size, const int16_t *col)
 {
     int c0, c1, c2, c3, a0, a1, a2, a3;
 
@@ -182,7 +182,7 @@ static inline void idct4row(int16_t *row)
     row[3]= (c0 - c1) >> R_SHIFT;
 }
 
-void ff_simple_idct84_add(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct84_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     int i;
 
@@ -197,7 +197,7 @@ void ff_simple_idct84_add(uint8_t *dest, int line_size, int16_t *block)
     }
 }
 
-void ff_simple_idct48_add(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct48_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     int i;
 
@@ -212,7 +212,7 @@ void ff_simple_idct48_add(uint8_t *dest, int line_size, int16_t *block)
     }
 }
 
-void ff_simple_idct44_add(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct44_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     int i;
 
diff --git a/libavcodec/simple_idct.h b/libavcodec/simple_idct.h
index 154e297..2a5e1d7 100644
--- a/libavcodec/simple_idct.h
+++ b/libavcodec/simple_idct.h
@@ -28,18 +28,19 @@
 #ifndef AVCODEC_SIMPLE_IDCT_H
 #define AVCODEC_SIMPLE_IDCT_H
 
+#include <stddef.h>
 #include <stdint.h>
 
-void ff_simple_idct_put_8(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct_add_8(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_put_8(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_8(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_8(int16_t *block);
 
-void ff_simple_idct_put_10(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct_add_10(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_put_10(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_10(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_10(int16_t *block);
 
-void ff_simple_idct_put_12(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct_add_12(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_put_12(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_12(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_12(int16_t *block);
 
 /**
@@ -49,10 +50,10 @@ void ff_simple_idct_12(int16_t *block);
  */
 void ff_prores_idct(int16_t *block, const int16_t *qmat);
 
-void ff_simple_idct248_put(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
-void ff_simple_idct84_add(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct48_add(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct44_add(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct84_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct48_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct44_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 #endif /* AVCODEC_SIMPLE_IDCT_H */
diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c
index c669767..f532313 100644
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@ -250,7 +250,7 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
 #ifdef EXTRA_SHIFT
 static inline void FUNC(idctSparseCol_extrashift)(int16_t *col)
 #else
-static inline void FUNC(idctSparseColPut)(pixel *dest, int line_size,
+static inline void FUNC(idctSparseColPut)(pixel *dest, ptrdiff_t line_size,
                                           int16_t *col)
 {
     SUINT a0, a1, a2, a3, b0, b1, b2, b3;
@@ -274,7 +274,7 @@ static inline void FUNC(idctSparseColPut)(pixel *dest, int line_size,
     dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT);
 }
 
-static inline void FUNC(idctSparseColAdd)(pixel *dest, int line_size,
+static inline void FUNC(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
                                           int16_t *col)
 {
     int a0, a1, a2, a3, b0, b1, b2, b3;
@@ -316,7 +316,7 @@ static inline void FUNC(idctSparseCol)(int16_t *col)
 }
 
 #ifndef EXTRA_SHIFT
-void FUNC(ff_simple_idct_put)(uint8_t *dest_, int line_size, int16_t *block)
+void FUNC(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
 {
     pixel *dest = (pixel *)dest_;
     int i;
@@ -330,7 +330,7 @@ void FUNC(ff_simple_idct_put)(uint8_t *dest_, int line_size, int16_t *block)
         FUNC(idctSparseColPut)(dest + i, line_size, block + i);
 }
 
-void FUNC(ff_simple_idct_add)(uint8_t *dest_, int line_size, int16_t *block)
+void FUNC(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
 {
     pixel *dest = (pixel *)dest_;
     int i;
diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
index 4f0e0d9..29af3fe 100644
--- a/libavcodec/tests/dct.c
+++ b/libavcodec/tests/dct.c
@@ -284,7 +284,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 
-static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
+static void idct248_ref(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 {
     static int init;
     static double c8[8][8];
@@ -365,7 +365,8 @@ static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 }
 
 static void idct248_error(const char *name,
-                          void (*idct248_put)(uint8_t *dest, int line_size,
+                          void (*idct248_put)(uint8_t *dest,
+                                              ptrdiff_t line_size,
                                               int16_t *block),
                           int speed)
 {
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 8f48f0c..eaadebe 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -238,7 +238,7 @@ static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq)
 }
 
 /* Do inverse transform on 8x8 block */
-static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     int dc = block[0];
@@ -255,7 +255,7 @@ static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, int16_t *block)
         dest[5] = av_clip_uint8(dest[5] + dc);
         dest[6] = av_clip_uint8(dest[6] + dc);
         dest[7] = av_clip_uint8(dest[7] + dc);
-        dest += linesize;
+        dest += stride;
     }
 }
 
@@ -329,7 +329,7 @@ static void vc1_inv_trans_8x8_c(int16_t block[64])
 }
 
 /* Do inverse transform on 8x4 part of block */
-static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     int dc = block[0];
@@ -346,11 +346,11 @@ static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, int16_t *block)
         dest[5] = av_clip_uint8(dest[5] + dc);
         dest[6] = av_clip_uint8(dest[6] + dc);
         dest[7] = av_clip_uint8(dest[7] + dc);
-        dest += linesize;
+        dest += stride;
     }
 }
 
-static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_8x4_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     register int t1, t2, t3, t4, t5, t6, t7, t8;
@@ -395,10 +395,10 @@ static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, int16_t *block)
         t3 = 22 * src[ 8] + 10 * src[24];
         t4 = 22 * src[24] - 10 * src[ 8];
 
-        dest[0 * linesize] = av_clip_uint8(dest[0 * linesize] + ((t1 + t3) >> 7));
-        dest[1 * linesize] = av_clip_uint8(dest[1 * linesize] + ((t2 - t4) >> 7));
-        dest[2 * linesize] = av_clip_uint8(dest[2 * linesize] + ((t2 + t4) >> 7));
-        dest[3 * linesize] = av_clip_uint8(dest[3 * linesize] + ((t1 - t3) >> 7));
+        dest[0 * stride] = av_clip_uint8(dest[0 * stride] + ((t1 + t3) >> 7));
+        dest[1 * stride] = av_clip_uint8(dest[1 * stride] + ((t2 - t4) >> 7));
+        dest[2 * stride] = av_clip_uint8(dest[2 * stride] + ((t2 + t4) >> 7));
+        dest[3 * stride] = av_clip_uint8(dest[3 * stride] + ((t1 - t3) >> 7));
 
         src++;
         dest++;
@@ -406,7 +406,7 @@ static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, int16_t *block)
 }
 
 /* Do inverse transform on 4x8 parts of block */
-static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     int dc = block[0];
@@ -419,11 +419,11 @@ static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, int16_t *block)
         dest[1] = av_clip_uint8(dest[1] + dc);
         dest[2] = av_clip_uint8(dest[2] + dc);
         dest[3] = av_clip_uint8(dest[3] + dc);
-        dest += linesize;
+        dest += stride;
     }
 }
 
-static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_4x8_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     register int t1, t2, t3, t4, t5, t6, t7, t8;
@@ -464,14 +464,14 @@ static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, int16_t *block)
         t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
         t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
 
-        dest[0 * linesize] = av_clip_uint8(dest[0 * linesize] + ((t5 + t1)     >> 7));
-        dest[1 * linesize] = av_clip_uint8(dest[1 * linesize] + ((t6 + t2)     >> 7));
-        dest[2 * linesize] = av_clip_uint8(dest[2 * linesize] + ((t7 + t3)     >> 7));
-        dest[3 * linesize] = av_clip_uint8(dest[3 * linesize] + ((t8 + t4)     >> 7));
-        dest[4 * linesize] = av_clip_uint8(dest[4 * linesize] + ((t8 - t4 + 1) >> 7));
-        dest[5 * linesize] = av_clip_uint8(dest[5 * linesize] + ((t7 - t3 + 1) >> 7));
-        dest[6 * linesize] = av_clip_uint8(dest[6 * linesize] + ((t6 - t2 + 1) >> 7));
-        dest[7 * linesize] = av_clip_uint8(dest[7 * linesize] + ((t5 - t1 + 1) >> 7));
+        dest[0 * stride] = av_clip_uint8(dest[0 * stride] + ((t5 + t1)     >> 7));
+        dest[1 * stride] = av_clip_uint8(dest[1 * stride] + ((t6 + t2)     >> 7));
+        dest[2 * stride] = av_clip_uint8(dest[2 * stride] + ((t7 + t3)     >> 7));
+        dest[3 * stride] = av_clip_uint8(dest[3 * stride] + ((t8 + t4)     >> 7));
+        dest[4 * stride] = av_clip_uint8(dest[4 * stride] + ((t8 - t4 + 1) >> 7));
+        dest[5 * stride] = av_clip_uint8(dest[5 * stride] + ((t7 - t3 + 1) >> 7));
+        dest[6 * stride] = av_clip_uint8(dest[6 * stride] + ((t6 - t2 + 1) >> 7));
+        dest[7 * stride] = av_clip_uint8(dest[7 * stride] + ((t5 - t1 + 1) >> 7));
 
         src++;
         dest++;
@@ -479,7 +479,7 @@ static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, int16_t *block)
 }
 
 /* Do inverse transform on 4x4 part of block */
-static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     int dc = block[0];
@@ -492,11 +492,11 @@ static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, int16_t *block)
         dest[1] = av_clip_uint8(dest[1] + dc);
         dest[2] = av_clip_uint8(dest[2] + dc);
         dest[3] = av_clip_uint8(dest[3] + dc);
-        dest += linesize;
+        dest += stride;
     }
 }
 
-static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, int16_t *block)
+static void vc1_inv_trans_4x4_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 {
     int i;
     register int t1, t2, t3, t4;
@@ -526,10 +526,10 @@ static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, int16_t *block)
         t3 = 22 * src[8] + 10 * src[24];
         t4 = 22 * src[24] - 10 * src[8];
 
-        dest[0 * linesize] = av_clip_uint8(dest[0 * linesize] + ((t1 + t3) >> 7));
-        dest[1 * linesize] = av_clip_uint8(dest[1 * linesize] + ((t2 - t4) >> 7));
-        dest[2 * linesize] = av_clip_uint8(dest[2 * linesize] + ((t2 + t4) >> 7));
-        dest[3 * linesize] = av_clip_uint8(dest[3 * linesize] + ((t1 - t3) >> 7));
+        dest[0 * stride] = av_clip_uint8(dest[0 * stride] + ((t1 + t3) >> 7));
+        dest[1 * stride] = av_clip_uint8(dest[1 * stride] + ((t2 - t4) >> 7));
+        dest[2 * stride] = av_clip_uint8(dest[2 * stride] + ((t2 + t4) >> 7));
+        dest[3 * stride] = av_clip_uint8(dest[3 * stride] + ((t1 - t3) >> 7));
 
         src++;
         dest++;
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index 9543070..16b3528 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -35,13 +35,13 @@ typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const
 typedef struct VC1DSPContext {
     /* vc1 functions */
     void (*vc1_inv_trans_8x8)(int16_t *b);
-    void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, int16_t *block);
-    void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, int16_t *block);
-    void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, int16_t *block);
-    void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, int16_t *block);
-    void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, int16_t *block);
-    void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, int16_t *block);
-    void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, int16_t *block);
+    void (*vc1_inv_trans_8x4)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    void (*vc1_inv_trans_4x8)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    void (*vc1_inv_trans_4x4)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
     void (*vc1_v_overlap)(uint8_t *src, int stride);
     void (*vc1_h_overlap)(uint8_t *src, int stride);
     void (*vc1_v_s_overlap)(int16_t *top,  int16_t *bottom);
diff --git a/libavcodec/wmv2dsp.c b/libavcodec/wmv2dsp.c
index b9e98d5..162ac92 100644
--- a/libavcodec/wmv2dsp.c
+++ b/libavcodec/wmv2dsp.c
@@ -93,7 +93,7 @@ static void wmv2_idct_col(short * b)
     b[8 * 7] = (a0 + a2 - a1 - a5 + (1 << 13)) >> 14;
 }
 
-static void wmv2_idct_add_c(uint8_t *dest, int line_size, int16_t *block)
+static void wmv2_idct_add_c(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     int i;
 
@@ -116,7 +116,7 @@ static void wmv2_idct_add_c(uint8_t *dest, int line_size, int16_t *block)
     }
 }
 
-static void wmv2_idct_put_c(uint8_t *dest, int line_size, int16_t *block)
+static void wmv2_idct_put_c(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     int i;
 
diff --git a/libavcodec/wmv2dsp.h b/libavcodec/wmv2dsp.h
index 62a990d..5e40b30 100644
--- a/libavcodec/wmv2dsp.h
+++ b/libavcodec/wmv2dsp.h
@@ -24,8 +24,8 @@
 #include "qpeldsp.h"
 
 typedef struct WMV2DSPContext {
-    void (*idct_add)(uint8_t *dest, int line_size, int16_t *block);
-    void (*idct_put)(uint8_t *dest, int line_size, int16_t *block);
+    void (*idct_add)(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+    void (*idct_put)(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
     qpel_mc_func put_mspel_pixels_tab[8];
 
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index daa4e79..e2e296a 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -19,6 +19,7 @@
 #ifndef AVCODEC_X86_IDCTDSP_H
 #define AVCODEC_X86_IDCTDSP_H
 
+#include <stddef.h>
 #include <stdint.h>
 #include <stddef.h>
 
@@ -35,4 +36,5 @@ void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
 void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                        ptrdiff_t line_size);
 
+
 #endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 333d6e1..d3a19fa 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -904,12 +904,12 @@ void ff_simple_idct_mmx(int16_t *block)
 
 //FIXME merge add/put into the idct
 
-void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     idct(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
-void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
+void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     idct(block);
     ff_add_pixels_clamped(block, dest, line_size);
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 8eeb31e..ad76baf 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -19,22 +19,23 @@
 #ifndef AVCODEC_X86_SIMPLE_IDCT_H
 #define AVCODEC_X86_SIMPLE_IDCT_H
 
+#include <stddef.h>
 #include <stdint.h>
 
 void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
-void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 void ff_simple_idct12_sse2(int16_t *block);
 void ff_simple_idct12_avx(int16_t *block);
 
-void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block);
-void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 #endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index c8943fa..e05ae06 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -92,13 +92,13 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
-void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                     int16_t *block);
-void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                     int16_t *block);
-void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                     int16_t *block);
-void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                     int16_t *block);
 
 
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 175c397..2850ca8 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -225,7 +225,7 @@ HOR_16B_SHIFT2 OP_AVG, avg
     mov%1 [linesize3q +destq], m5
 %endmacro
 
-; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 INIT_MMX mmxext
 cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
     movsx         r3d, WORD [blockq]
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 573b25c..edb5ebf 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -26,18 +26,19 @@
 #ifndef AVCODEC_X86_XVIDIDCT_H
 #define AVCODEC_X86_XVIDIDCT_H
 
+#include <stddef.h>
 #include <stdint.h>
 
 void ff_xvid_idct_mmx(short *block);
-void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block);
-void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block);
+void ff_xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 void ff_xvid_idct_mmxext(short *block);
-void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
-void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
+void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 void ff_xvid_idct_sse2(short *block);
-void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
-void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
+void ff_xvid_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
 
 #endif /* AVCODEC_X86_XVIDIDCT_H */
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index 8b9d8de..fd10953 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -27,25 +27,25 @@
 #include "xvididct.h"
 
 #if ARCH_X86_32 && HAVE_YASM
-static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
+static void xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, short *block)
 {
     ff_xvid_idct_mmx(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
 
-static void xvid_idct_mmx_add(uint8_t *dest, int line_size, short *block)
+static void xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, short *block)
 {
     ff_xvid_idct_mmx(block);
     ff_add_pixels_clamped(block, dest, line_size);
 }
 
-static void xvid_idct_mmxext_put(uint8_t *dest, int line_size, short *block)
+static void xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, short *block)
 {
     ff_xvid_idct_mmxext(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
 
-static void xvid_idct_mmxext_add(uint8_t *dest, int line_size, short *block)
+static void xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, short *block)
 {
     ff_xvid_idct_mmxext(block);
     ff_add_pixels_clamped(block, dest, line_size);
diff --git a/libavcodec/xvididct.c b/libavcodec/xvididct.c
index 1f96ccc..4642a30 100644
--- a/libavcodec/xvididct.c
+++ b/libavcodec/xvididct.c
@@ -318,13 +318,13 @@ void ff_xvid_idct(int16_t *const in)
     }
 }
 
-static void xvid_idct_put(uint8_t *dest, int line_size, int16_t *block)
+static void xvid_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_xvid_idct(block);
     ff_put_pixels_clamped(block, dest, line_size);
 }
 
-static void xvid_idct_add(uint8_t *dest, int line_size, int16_t *block)
+static void xvid_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_xvid_idct(block);
     ff_add_pixels_clamped(block, dest, line_size);


======================================================================

diff --cc libavcodec/aarch64/idct.h
index 05699c2,0000000..5c49046
mode 100644,000000..100644
--- a/libavcodec/aarch64/idct.h
+++ b/libavcodec/aarch64/idct.h
@@@ -1,28 -1,0 +1,28 @@@
 +/*
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_AARCH64_IDCT_H
 +#define AVCODEC_AARCH64_IDCT_H
 +
 +#include <stdint.h>
 +
 +void ff_simple_idct_neon(int16_t *data);
- void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
- void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
++void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
++void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
 +
 +#endif /* AVCODEC_AARCH64_IDCT_H */
diff --cc libavcodec/alpha/idctdsp_alpha.h
index bf98495,0000000..8cc969d
mode 100644,000000..100644
--- a/libavcodec/alpha/idctdsp_alpha.h
+++ b/libavcodec/alpha/idctdsp_alpha.h
@@@ -1,34 -1,0 +1,34 @@@
 +/*
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_ALPHA_IDCTDSP_ALPHA_H
 +#define AVCODEC_ALPHA_IDCTDSP_ALPHA_H
 +
 +#include <stddef.h>
 +#include <stdint.h>
 +
 +extern void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
 +                                        ptrdiff_t line_size);
 +extern void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
 +                                        ptrdiff_t line_size);
 +
 +void ff_simple_idct_axp(int16_t *block);
- void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
++void ff_simple_idct_put_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
++void ff_simple_idct_add_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 +
 +#endif /* AVCODEC_ALPHA_IDCTDSP_ALPHA_H */
diff --cc libavcodec/alpha/simple_idct_alpha.c
index 04be0ce,0000000..6e377ef
mode 100644,000000..100644
--- a/libavcodec/alpha/simple_idct_alpha.c
+++ b/libavcodec/alpha/simple_idct_alpha.c
@@@ -1,303 -1,0 +1,303 @@@
 +/*
 + * Simple IDCT (Alpha optimized)
 + *
 + * Copyright (c) 2001 Michael Niedermayer <michaelni at gmx.at>
 + *
 + * based upon some outcommented C code from mpeg2dec (idct_mmx.c
 + * written by Aaron Holtzman <aholtzma at ess.engr.uvic.ca>)
 + *
 + * Alpha optimizations by Måns Rullgård <mans at mansr.com>
 + *                     and Falk Hueffner <falk at debian.org>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "idctdsp_alpha.h"
 +#include "asm.h"
 +
 +// cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
 +// W4 is actually exactly 16384, but using 16383 works around
 +// accumulating rounding errors for some encoders
 +#define W1 22725
 +#define W2 21407
 +#define W3 19266
 +#define W4 16383
 +#define W5 12873
 +#define W6  8867
 +#define W7  4520
 +#define ROW_SHIFT 11
 +#define COL_SHIFT 20
 +
 +/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
 +static inline int idct_row(int16_t *row)
 +{
 +    int a0, a1, a2, a3, b0, b1, b2, b3, t;
 +    uint64_t l, r, t2;
 +    l = ldq(row);
 +    r = ldq(row + 4);
 +
 +    if (l == 0 && r == 0)
 +        return 0;
 +
 +    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
 +
 +    if (((l & ~0xffffUL) | r) == 0) {
 +        a0 >>= ROW_SHIFT;
 +        t2 = (uint16_t) a0;
 +        t2 |= t2 << 16;
 +        t2 |= t2 << 32;
 +
 +        stq(t2, row);
 +        stq(t2, row + 4);
 +        return 1;
 +    }
 +
 +    a1 = a0;
 +    a2 = a0;
 +    a3 = a0;
 +
 +    t = extwl(l, 4);            /* row[2] */
 +    if (t != 0) {
 +        t = sextw(t);
 +        a0 += W2 * t;
 +        a1 += W6 * t;
 +        a2 -= W6 * t;
 +        a3 -= W2 * t;
 +    }
 +
 +    t = extwl(r, 0);            /* row[4] */
 +    if (t != 0) {
 +        t = sextw(t);
 +        a0 += W4 * t;
 +        a1 -= W4 * t;
 +        a2 -= W4 * t;
 +        a3 += W4 * t;
 +    }
 +
 +    t = extwl(r, 4);            /* row[6] */
 +    if (t != 0) {
 +        t = sextw(t);
 +        a0 += W6 * t;
 +        a1 -= W2 * t;
 +        a2 += W2 * t;
 +        a3 -= W6 * t;
 +    }
 +
 +    t = extwl(l, 2);            /* row[1] */
 +    if (t != 0) {
 +        t = sextw(t);
 +        b0 = W1 * t;
 +        b1 = W3 * t;
 +        b2 = W5 * t;
 +        b3 = W7 * t;
 +    } else {
 +        b0 = 0;
 +        b1 = 0;
 +        b2 = 0;
 +        b3 = 0;
 +    }
 +
 +    t = extwl(l, 6);            /* row[3] */
 +    if (t) {
 +        t = sextw(t);
 +        b0 += W3 * t;
 +        b1 -= W7 * t;
 +        b2 -= W1 * t;
 +        b3 -= W5 * t;
 +    }
 +
 +
 +    t = extwl(r, 2);            /* row[5] */
 +    if (t) {
 +        t = sextw(t);
 +        b0 += W5 * t;
 +        b1 -= W1 * t;
 +        b2 += W7 * t;
 +        b3 += W3 * t;
 +    }
 +
 +    t = extwl(r, 6);            /* row[7] */
 +    if (t) {
 +        t = sextw(t);
 +        b0 += W7 * t;
 +        b1 -= W5 * t;
 +        b2 += W3 * t;
 +        b3 -= W1 * t;
 +    }
 +
 +    row[0] = (a0 + b0) >> ROW_SHIFT;
 +    row[1] = (a1 + b1) >> ROW_SHIFT;
 +    row[2] = (a2 + b2) >> ROW_SHIFT;
 +    row[3] = (a3 + b3) >> ROW_SHIFT;
 +    row[4] = (a3 - b3) >> ROW_SHIFT;
 +    row[5] = (a2 - b2) >> ROW_SHIFT;
 +    row[6] = (a1 - b1) >> ROW_SHIFT;
 +    row[7] = (a0 - b0) >> ROW_SHIFT;
 +
 +    return 2;
 +}
 +
 +static inline void idct_col(int16_t *col)
 +{
 +    int a0, a1, a2, a3, b0, b1, b2, b3;
 +
 +    col[0] += (1 << (COL_SHIFT - 1)) / W4;
 +
 +    a0 = W4 * col[8 * 0];
 +    a1 = W4 * col[8 * 0];
 +    a2 = W4 * col[8 * 0];
 +    a3 = W4 * col[8 * 0];
 +
 +    if (col[8 * 2]) {
 +        a0 += W2 * col[8 * 2];
 +        a1 += W6 * col[8 * 2];
 +        a2 -= W6 * col[8 * 2];
 +        a3 -= W2 * col[8 * 2];
 +    }
 +
 +    if (col[8 * 4]) {
 +        a0 += W4 * col[8 * 4];
 +        a1 -= W4 * col[8 * 4];
 +        a2 -= W4 * col[8 * 4];
 +        a3 += W4 * col[8 * 4];
 +    }
 +
 +    if (col[8 * 6]) {
 +        a0 += W6 * col[8 * 6];
 +        a1 -= W2 * col[8 * 6];
 +        a2 += W2 * col[8 * 6];
 +        a3 -= W6 * col[8 * 6];
 +    }
 +
 +    if (col[8 * 1]) {
 +        b0 = W1 * col[8 * 1];
 +        b1 = W3 * col[8 * 1];
 +        b2 = W5 * col[8 * 1];
 +        b3 = W7 * col[8 * 1];
 +    } else {
 +        b0 = 0;
 +        b1 = 0;
 +        b2 = 0;
 +        b3 = 0;
 +    }
 +
 +    if (col[8 * 3]) {
 +        b0 += W3 * col[8 * 3];
 +        b1 -= W7 * col[8 * 3];
 +        b2 -= W1 * col[8 * 3];
 +        b3 -= W5 * col[8 * 3];
 +    }
 +
 +    if (col[8 * 5]) {
 +        b0 += W5 * col[8 * 5];
 +        b1 -= W1 * col[8 * 5];
 +        b2 += W7 * col[8 * 5];
 +        b3 += W3 * col[8 * 5];
 +    }
 +
 +    if (col[8 * 7]) {
 +        b0 += W7 * col[8 * 7];
 +        b1 -= W5 * col[8 * 7];
 +        b2 += W3 * col[8 * 7];
 +        b3 -= W1 * col[8 * 7];
 +    }
 +
 +    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
 +    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
 +    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
 +    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
 +    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
 +    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
 +    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
 +    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
 +}
 +
 +/* If all rows but the first one are zero after row transformation,
 +   all rows will be identical after column transformation.  */
 +static inline void idct_col2(int16_t *col)
 +{
 +    int i;
 +    uint64_t l, r;
 +
 +    for (i = 0; i < 8; ++i) {
 +        int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
 +
 +        a0 *= W4;
 +        col[i] = a0 >> COL_SHIFT;
 +    }
 +
 +    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
 +    stq(l, col +  2 * 4); stq(r, col +  3 * 4);
 +    stq(l, col +  4 * 4); stq(r, col +  5 * 4);
 +    stq(l, col +  6 * 4); stq(r, col +  7 * 4);
 +    stq(l, col +  8 * 4); stq(r, col +  9 * 4);
 +    stq(l, col + 10 * 4); stq(r, col + 11 * 4);
 +    stq(l, col + 12 * 4); stq(r, col + 13 * 4);
 +    stq(l, col + 14 * 4); stq(r, col + 15 * 4);
 +}
 +
 +void ff_simple_idct_axp(int16_t *block)
 +{
 +
 +    int i;
 +    int rowsZero = 1;           /* all rows except row 0 zero */
 +    int rowsConstant = 1;       /* all rows consist of a constant value */
 +
 +    for (i = 0; i < 8; i++) {
 +        int sparseness = idct_row(block + 8 * i);
 +
 +        if (i > 0 && sparseness > 0)
 +            rowsZero = 0;
 +        if (sparseness == 2)
 +            rowsConstant = 0;
 +    }
 +
 +    if (rowsZero) {
 +        idct_col2(block);
 +    } else if (rowsConstant) {
 +        idct_col(block);
 +        for (i = 0; i < 8; i += 2) {
 +            uint64_t v = (uint16_t) block[0];
 +            uint64_t w = (uint16_t) block[8];
 +
 +            v |= v << 16;
 +            w |= w << 16;
 +            v |= v << 32;
 +            w |= w << 32;
 +            stq(v, block + 0 * 4);
 +            stq(v, block + 1 * 4);
 +            stq(w, block + 2 * 4);
 +            stq(w, block + 3 * 4);
 +            block += 4 * 4;
 +        }
 +    } else {
 +        for (i = 0; i < 8; i++)
 +            idct_col(block + i);
 +    }
 +}
 +
- void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
++void ff_simple_idct_put_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    ff_simple_idct_axp(block);
 +    put_pixels_clamped_axp_p(block, dest, line_size);
 +}
 +
- void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
++void ff_simple_idct_add_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    ff_simple_idct_axp(block);
 +    add_pixels_clamped_axp_p(block, dest, line_size);
 +}
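
Illustration (not part of the commit): the W1..W7 table above is documented as cos(i*pi/16) * sqrt(2) * (1 << 14), with W4 deliberately lowered from the exact 16384 to 16383. A small standalone program that reproduces the table:

/* Recompute the fixed-point IDCT weights: W_i = round(cos(i*pi/16) * sqrt(2) * 2^14).
 * Expected output: 22725 21407 19266 16384 12873 8867 4520
 * (the source above substitutes 16383 for W4 to damp accumulated rounding). */
#include <math.h>
#include <stdio.h>

int main(void)
{
    const double pi = acos(-1.0);
    for (int i = 1; i <= 7; i++)
        printf("W%d = %ld\n", i, lround(cos(i * pi / 16.0) * sqrt(2.0) * (1 << 14)));
    return 0;
}
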
diff --cc libavcodec/dct.h
index 05297ba,46893a6..0a03e25
--- a/libavcodec/dct.h
+++ b/libavcodec/dct.h
@@@ -21,9 -21,10 +21,10 @@@
   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  
 -#ifndef AVCODEC_DCT_H
 +#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
  #define AVCODEC_DCT_H
  
+ #include <stddef.h>
  #include <stdint.h>
  
  #include "rdft.h"
@@@ -59,10 -60,7 +60,10 @@@ void ff_fdct248_islow_8(int16_t *data)
  void ff_fdct248_islow_10(int16_t *data);
  
  void ff_j_rev_dct(int16_t *data);
 +void ff_j_rev_dct4(int16_t *data);
 +void ff_j_rev_dct2(int16_t *data);
 +void ff_j_rev_dct1(int16_t *data);
- void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block);
- void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block);
+ void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+ void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
  
  #endif /* AVCODEC_DCT_H */
diff --cc libavcodec/idctdsp.c
index 37f4640,5a267e4..84dd645
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@@ -177,67 -144,6 +177,67 @@@ static void add_pixels_clamped_c(const 
      }
  }
  
 +static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
 +                          int line_size)
 +{
 +    int i;
 +
 +    /* read the pixels */
 +    for(i=0;i<4;i++) {
 +        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 +        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 +        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 +        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 +        pixels += line_size;
 +        block += 8;
 +    }
 +}
 +
 +static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
 +                          int line_size)
 +{
 +    int i;
 +
 +    /* read the pixels */
 +    for(i=0;i<2;i++) {
 +        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 +        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 +        pixels += line_size;
 +        block += 8;
 +    }
 +}
 +
- static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
++static void ff_jref_idct4_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    ff_j_rev_dct4 (block);
 +    put_pixels_clamped4_c(block, dest, line_size);
 +}
- static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
++static void ff_jref_idct4_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    ff_j_rev_dct4 (block);
 +    add_pixels_clamped4_c(block, dest, line_size);
 +}
 +
- static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
++static void ff_jref_idct2_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    ff_j_rev_dct2 (block);
 +    put_pixels_clamped2_c(block, dest, line_size);
 +}
- static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
++static void ff_jref_idct2_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    ff_j_rev_dct2 (block);
 +    add_pixels_clamped2_c(block, dest, line_size);
 +}
 +
- static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
++static void ff_jref_idct1_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    dest[0] = av_clip_uint8((block[0] + 4)>>3);
 +}
- static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
++static void ff_jref_idct1_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 +{
 +    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
 +}
 +
  av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
  {
      const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
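
Illustration (not part of the commit): ff_jref_idct1_put() above reduces the 1x1 case to a rounded division of the DC coefficient by 8 followed by a clamp to the 0..255 pixel range. A tiny standalone check of that arithmetic (clip_uint8 is a local stand-in for av_clip_uint8):

/* "(dc + 4) >> 3" is dc/8 rounded to nearest (half up), then clamped to 8 bits. */
#include <stdio.h>

static unsigned char clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;   /* stand-in for av_clip_uint8() */
}

int main(void)
{
    const int dc[] = { 20, 1, 2100 };
    for (int i = 0; i < 3; i++)
        printf("dc=%4d -> pixel=%d\n", dc[i], clip_uint8((dc[i] + 4) >> 3));
    /* prints 3 (20/8 = 2.5 rounded up), 0, and 255 (263 clamped) */
    return 0;
}
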
diff --cc libavcodec/jrevdct.c
index 55a7392,808f583..89dd9f2
--- a/libavcodec/jrevdct.c
+++ b/libavcodec/jrevdct.c
@@@ -943,220 -943,7 +943,220 @@@ void ff_j_rev_dct(DCTBLOCK data
    }
  }
  
 +#undef DCTSIZE
 +#define DCTSIZE 4
 +#define DCTSTRIDE 8
 +
 +void ff_j_rev_dct4(DCTBLOCK data)
 +{
 +  int32_t tmp0, tmp1, tmp2, tmp3;
 +  int32_t tmp10, tmp11, tmp12, tmp13;
 +  int32_t z1;
 +  int32_t d0, d2, d4, d6;
 +  register int16_t *dataptr;
 +  int rowctr;
 +
 +  /* Pass 1: process rows. */
 +  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
 +  /* furthermore, we scale the results by 2**PASS1_BITS. */
 +
 +  data[0] += 4;
 +
 +  dataptr = data;
 +
 +  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
 +    /* Due to quantization, we will usually find that many of the input
 +     * coefficients are zero, especially the AC terms.  We can exploit this
 +     * by short-circuiting the IDCT calculation for any row in which all
 +     * the AC terms are zero.  In that case each output is equal to the
 +     * DC coefficient (with scale factor as needed).
 +     * With typical images and quantization tables, half or more of the
 +     * row DCT calculations can be simplified this way.
 +     */
 +
 +    register int *idataptr = (int*)dataptr;
 +
 +    d0 = dataptr[0];
 +    d2 = dataptr[1];
 +    d4 = dataptr[2];
 +    d6 = dataptr[3];
 +
 +    if ((d2 | d4 | d6) == 0) {
 +      /* AC terms all zero */
 +      if (d0) {
 +          /* Compute a 32 bit value to assign. */
 +          int16_t dcval = (int16_t) (d0 << PASS1_BITS);
 +          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
 +
 +          idataptr[0] = v;
 +          idataptr[1] = v;
 +      }
 +
 +      dataptr += DCTSTRIDE;     /* advance pointer to next row */
 +      continue;
 +    }
 +
 +    /* Even part: reverse the even part of the forward DCT. */
 +    /* The rotator is sqrt(2)*c(-6). */
 +    if (d6) {
 +            if (d2) {
 +                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
 +                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
 +                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
 +                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
 +
 +                    tmp0 = (d0 + d4) << CONST_BITS;
 +                    tmp1 = (d0 - d4) << CONST_BITS;
 +
 +                    tmp10 = tmp0 + tmp3;
 +                    tmp13 = tmp0 - tmp3;
 +                    tmp11 = tmp1 + tmp2;
 +                    tmp12 = tmp1 - tmp2;
 +            } else {
 +                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
 +                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
 +                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
 +
 +                    tmp0 = (d0 + d4) << CONST_BITS;
 +                    tmp1 = (d0 - d4) << CONST_BITS;
 +
 +                    tmp10 = tmp0 + tmp3;
 +                    tmp13 = tmp0 - tmp3;
 +                    tmp11 = tmp1 + tmp2;
 +                    tmp12 = tmp1 - tmp2;
 +            }
 +    } else {
 +            if (d2) {
 +                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
 +                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
 +                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
 +
 +                    tmp0 = (d0 + d4) << CONST_BITS;
 +                    tmp1 = (d0 - d4) << CONST_BITS;
 +
 +                    tmp10 = tmp0 + tmp3;
 +                    tmp13 = tmp0 - tmp3;
 +                    tmp11 = tmp1 + tmp2;
 +                    tmp12 = tmp1 - tmp2;
 +            } else {
 +                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
 +                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
 +                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
 +            }
 +      }
 +
 +    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 +
 +    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
 +    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
 +    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
 +    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
 +
 +    dataptr += DCTSTRIDE;       /* advance pointer to next row */
 +  }
 +
 +  /* Pass 2: process columns. */
 +  /* Note that we must descale the results by a factor of 8 == 2**3, */
 +  /* and also undo the PASS1_BITS scaling. */
 +
 +  dataptr = data;
 +  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
 +    /* Columns of zeroes can be exploited in the same way as we did with rows.
 +     * However, the row calculation has created many nonzero AC terms, so the
 +     * simplification applies less often (typically 5% to 10% of the time).
 +     * On machines with very fast multiplication, it's possible that the
 +     * test takes more time than it's worth.  In that case this section
 +     * may be commented out.
 +     */
 +
 +    d0 = dataptr[DCTSTRIDE*0];
 +    d2 = dataptr[DCTSTRIDE*1];
 +    d4 = dataptr[DCTSTRIDE*2];
 +    d6 = dataptr[DCTSTRIDE*3];
 +
 +    /* Even part: reverse the even part of the forward DCT. */
 +    /* The rotator is sqrt(2)*c(-6). */
 +    if (d6) {
 +            if (d2) {
 +                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
 +                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
 +                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
 +                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
 +
 +                    tmp0 = (d0 + d4) << CONST_BITS;
 +                    tmp1 = (d0 - d4) << CONST_BITS;
 +
 +                    tmp10 = tmp0 + tmp3;
 +                    tmp13 = tmp0 - tmp3;
 +                    tmp11 = tmp1 + tmp2;
 +                    tmp12 = tmp1 - tmp2;
 +            } else {
 +                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
 +                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
 +                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
 +
 +                    tmp0 = (d0 + d4) << CONST_BITS;
 +                    tmp1 = (d0 - d4) << CONST_BITS;
 +
 +                    tmp10 = tmp0 + tmp3;
 +                    tmp13 = tmp0 - tmp3;
 +                    tmp11 = tmp1 + tmp2;
 +                    tmp12 = tmp1 - tmp2;
 +            }
 +    } else {
 +            if (d2) {
 +                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
 +                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
 +                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
 +
 +                    tmp0 = (d0 + d4) << CONST_BITS;
 +                    tmp1 = (d0 - d4) << CONST_BITS;
 +
 +                    tmp10 = tmp0 + tmp3;
 +                    tmp13 = tmp0 - tmp3;
 +                    tmp11 = tmp1 + tmp2;
 +                    tmp12 = tmp1 - tmp2;
 +            } else {
 +                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
 +                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
 +                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
 +            }
 +    }
 +
 +    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 +
 +    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
 +    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
 +    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
 +    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
 +
 +    dataptr++;                  /* advance pointer to next column */
 +  }
 +}
 +
 +void ff_j_rev_dct2(DCTBLOCK data){
 +  int d00, d01, d10, d11;
 +
 +  data[0] += 4;
 +  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
 +  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
 +  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
 +  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
 +
 +  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
 +  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
 +  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
 +  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
 +}
 +
 +void ff_j_rev_dct1(DCTBLOCK data){
 +  data[0] = (data[0] + 4)>>3;
 +}
 +
 +#undef FIX
 +#undef CONST_BITS
 +
- void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
+ void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
  {
      ff_j_rev_dct(block);
      ff_put_pixels_clamped(block, dest, line_size);
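
Illustration (not part of the commit): ff_j_rev_dct2() above is two levels of 2-point butterflies, sums and differences along the row and then along the column, with the usual "+4, >>3" rounded descale folded into the DC term. A short standalone walk-through with sample coefficients dc = data[0], b = data[1], c = data[8], d = data[9]:

/* Expanded, each of the four output samples is (dc + 4 +/- b +/- c +/- d) >> 3. */
#include <stdio.h>

int main(void)
{
    int dc = 16, b = 8, c = 4, d = 0;            /* arbitrary sample coefficients */
    int d00 = (dc + 4) + b, d01 = (dc + 4) - b;  /* row butterflies (rounding in DC) */
    int d10 = c + d,        d11 = c - d;

    printf("%d %d\n", (d00 + d10) >> 3, (d01 + d11) >> 3);   /* column butterflies: 4 2 */
    printf("%d %d\n", (d00 - d10) >> 3, (d01 - d11) >> 3);   /* 3 1 */
    return 0;
}
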
diff --cc libavcodec/mips/idctdsp_mips.h
index 19267e6,0000000..7ca7c1c
mode 100644,000000..100644
--- a/libavcodec/mips/idctdsp_mips.h
+++ b/libavcodec/mips/idctdsp_mips.h
@@@ -1,53 -1,0 +1,53 @@@
 +/*
 + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
 + *                    Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
 +#define AVCODEC_MIPS_IDCTDSP_MIPS_H
 +
 +#include "../mpegvideo.h"
 +
 +void ff_put_pixels_clamped_msa(const int16_t *block,
 +                               uint8_t *av_restrict pixels,
 +                               ptrdiff_t line_size);
 +void ff_put_signed_pixels_clamped_msa(const int16_t *block,
 +                                      uint8_t *av_restrict pixels,
 +                                      ptrdiff_t line_size);
 +void ff_add_pixels_clamped_msa(const int16_t *block,
 +                               uint8_t *av_restrict pixels,
 +                               ptrdiff_t line_size);
 +void ff_j_rev_dct_msa(int16_t *data);
- void ff_jref_idct_put_msa(uint8_t *dest, int32_t stride, int16_t *block);
- void ff_jref_idct_add_msa(uint8_t *dest, int32_t stride, int16_t *block);
++void ff_jref_idct_put_msa(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_jref_idct_add_msa(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 +void ff_simple_idct_msa(int16_t *block);
- void ff_simple_idct_put_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
- void ff_simple_idct_add_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
++void ff_simple_idct_put_msa(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
++void ff_simple_idct_add_msa(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
 +
 +void ff_put_pixels_clamped_mmi(const int16_t *block,
 +        uint8_t *av_restrict pixels, ptrdiff_t line_size);
 +void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
 +        uint8_t *av_restrict pixels, ptrdiff_t line_size);
 +void ff_add_pixels_clamped_mmi(const int16_t *block,
 +        uint8_t *av_restrict pixels, ptrdiff_t line_size);
 +void ff_simple_idct_mmi(int16_t *block);
- void ff_simple_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
- void ff_simple_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
++void ff_simple_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
++void ff_simple_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 +
 +#endif  // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
diff --cc libavcodec/mips/simple_idct_msa.c
index bd8b310,0000000..8a72359
mode 100644,000000..100644
--- a/libavcodec/mips/simple_idct_msa.c
+++ b/libavcodec/mips/simple_idct_msa.c
@@@ -1,573 -1,0 +1,573 @@@
 +/*
 + * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar at imgtec.com)
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "libavutil/mips/generic_macros_msa.h"
 +#include "idctdsp_mips.h"
 +
 +static void simple_idct_msa(int16_t *block)
 +{
 +    int32_t const_val;
 +    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
 +    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 +    v8i16 w1, w3, w5, w7;
 +    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
 +    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
 +    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
 +    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
 +    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
 +    v4i32 w2, w4, w6;
 +    v8i16 select_vec, temp;
 +    v8i16 zero = { 0 };
 +    v4i32 const_val0 = __msa_ldi_w(1);
 +    v4i32 const_val1 = __msa_ldi_w(1);
 +
 +    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
 +    const_val0 <<= 10;
 +    const_val = 16383 * ((1 << 19) / 16383);
 +    const_val1 = __msa_insert_w(const_val0, 0, const_val);
 +    const_val1 = __msa_splati_w(const_val1, 0);
 +    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 +                       in0, in1, in2, in3, in4, in5, in6, in7);
 +    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
 +    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
 +    UNPCK_SH_SW(in0, a0_r, a0_l);
 +    UNPCK_SH_SW(in2, temp3_r, temp3_l);
 +    temp = in0 << 3;
 +    w2 = (v4i32) __msa_splati_h(weights, 2);
 +    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
 +    w4 = (v4i32) __msa_splati_h(weights, 4);
 +    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
 +    w6 = (v4i32) __msa_splati_h(weights, 6);
 +    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
 +    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
 +    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
 +    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
 +         temp1_r, temp1_l, temp2_r, temp2_l);
 +    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
 +                temp2_l, temp2_r, temp1_l, temp1_r,
 +                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
 +    UNPCK_SH_SW(in4, temp0_r, temp0_l);
 +    UNPCK_SH_SW(in6, temp3_r, temp3_l);
 +    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
 +    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
 +         temp2_r, temp2_l, temp1_r, temp1_l);
 +    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
 +    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
 +         a1_r, a1_l, a2_r, a2_l);
 +    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
 +         a3_r, a3_l, a0_r, a0_l);
 +    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
 +    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
 +    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
 +    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
 +    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
 +    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
 +    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
 +               const0, const1, const2, const3);
 +    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
 +    const5 = __msa_ilvod_h(-w1, -w5);
 +    const7 = __msa_ilvod_h(w3, -w1);
 +    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
 +                b0_r, b1_r, b2_r, b3_r);
 +    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
 +                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
 +    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
 +                b0_l, b1_l, b2_l, b3_l);
 +    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
 +                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
 +    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
 +                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
 +                 temp0_r, temp0_l, temp1_r, temp1_l,
 +                 temp2_r, temp2_l, temp3_r, temp3_l,
 +                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
 +    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
 +    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
 +    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
 +                temp2_l, temp2_r, temp3_l, temp3_r,
 +                temp0_r, temp1_r, temp2_r, temp3_r);
 +    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
 +    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
 +    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
 +                a0_r, a1_r, a2_r, a3_r);
 +    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
 +    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
 +    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
 +    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
 +    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 +                       in0, in1, in2, in3, in4, in5, in6, in7);
 +
 +    UNPCK_SH_SW(in0, a0_r, a0_l);
 +    UNPCK_SH_SW(in2, temp3_r, temp3_l);
 +    w2 = (v4i32) __msa_splati_h(weights, 2);
 +    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
 +    w4 = (v4i32) __msa_splati_h(weights, 4);
 +    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
 +    w6 = (v4i32) __msa_splati_h(weights, 6);
 +    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
 +    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
 +    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
 +    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
 +         temp1_r, temp1_l, temp2_r, temp2_l);
 +    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
 +                temp2_l, temp2_r, temp1_l, temp1_r,
 +                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
 +    UNPCK_SH_SW(in4, temp0_r, temp0_l);
 +    UNPCK_SH_SW(in6, temp3_r, temp3_l);
 +    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
 +    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
 +         temp2_r, temp2_l, temp1_r, temp1_l);
 +    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
 +    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
 +         a1_r, a1_l, a2_r, a2_l);
 +    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
 +         a3_r, a3_l, a0_r, a0_l);
 +    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
 +    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
 +    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
 +    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
 +    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
 +    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
 +               const0, const1, const2, const3);
 +    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
 +                b0_r, b1_r, b2_r, b3_r);
 +    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
 +                b0_l, b1_l, b2_l, b3_l);
 +    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
 +    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
 +    const5 = __msa_ilvod_h(-w1, -w5);
 +    const7 = __msa_ilvod_h(w3, -w1);
 +    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
 +                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
 +    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
 +                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
 +    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
 +                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
 +                 temp0_r, temp0_l, temp1_r, temp1_l,
 +                 temp2_r, temp2_l, temp3_r, temp3_l,
 +                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
 +    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
 +    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
 +    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
 +                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
 +    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
 +    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
 +    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
 +                a0_r, a1_r, a2_r, a3_r);
 +    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
 +           block, 8);
 +}
 +
 +static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
 +                                int16_t *block)
 +{
 +    int32_t const_val;
 +    uint64_t tmp0, tmp1, tmp2, tmp3;
 +    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
 +    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 +    v8i16 w1, w3, w5, w7;
 +    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
 +    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
 +    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
 +    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
 +    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
 +    v4i32 w2, w4, w6;
 +    v8i16 select_vec, temp;
 +    v8i16 zero = { 0 };
 +    v4i32 const_val0 = __msa_ldi_w(1);
 +    v4i32 const_val1 = __msa_ldi_w(1);
 +
 +    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
 +    const_val0 <<= 10;
 +    const_val = 16383 * ((1 << 19) / 16383);
 +    const_val1 = __msa_insert_w(const_val0, 0, const_val);
 +    const_val1 = __msa_splati_w(const_val1, 0);
 +    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 +                       in0, in1, in2, in3, in4, in5, in6, in7);
 +    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
 +    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
 +    UNPCK_SH_SW(in0, a0_r, a0_l);
 +    UNPCK_SH_SW(in2, temp3_r, temp3_l);
 +    temp = in0 << 3;
 +    w2 = (v4i32) __msa_splati_h(weights, 2);
 +    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
 +    w4 = (v4i32) __msa_splati_h(weights, 4);
 +    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
 +    w6 = (v4i32) __msa_splati_h(weights, 6);
 +    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
 +    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
 +    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
 +    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
 +                temp2_l, temp2_r, temp1_l, temp1_r,
 +                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
 +    UNPCK_SH_SW(in4, temp0_r, temp0_l);
 +    UNPCK_SH_SW(in6, temp3_r, temp3_l);
 +    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
 +    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
 +    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
 +    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
 +    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
 +    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
 +    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
 +    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
 +    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
 +    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
 +    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
 +    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
 +    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
 +               const0, const1, const2, const3);
 +    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
 +    const5 = __msa_ilvod_h(-w1, -w5);
 +    const7 = __msa_ilvod_h(w3, -w1);
 +    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
 +                b0_r, b1_r, b2_r, b3_r);
 +    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
 +                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
 +    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
 +                b0_l, b1_l, b2_l, b3_l);
 +    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
 +                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
 +    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
 +                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
 +                 temp0_r, temp0_l, temp1_r, temp1_l,
 +                 temp2_r, temp2_l, temp3_r, temp3_l,
 +                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
 +    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
 +    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
 +    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
 +                temp2_l, temp2_r, temp3_l, temp3_r,
 +                temp0_r, temp1_r, temp2_r, temp3_r);
 +    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
 +    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
 +    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
 +                a0_r, a1_r, a2_r, a3_r);
 +    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
 +    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
 +    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
 +    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
 +    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 +                       in0, in1, in2, in3, in4, in5, in6, in7);
 +    UNPCK_SH_SW(in0, a0_r, a0_l);
 +    UNPCK_SH_SW(in2, temp3_r, temp3_l);
 +    w2 = (v4i32) __msa_splati_h(weights, 2);
 +    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
 +    w4 = (v4i32) __msa_splati_h(weights, 4);
 +    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
 +    w6 = (v4i32) __msa_splati_h(weights, 6);
 +    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
 +    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
 +    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
 +    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
 +                temp2_l, temp2_r, temp1_l, temp1_r,
 +                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
 +    UNPCK_SH_SW(in4, temp0_r, temp0_l);
 +    UNPCK_SH_SW(in6, temp3_r, temp3_l);
 +    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
 +    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
 +    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
 +    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
 +    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
 +    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
 +    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
 +    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
 +    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
 +    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
 +    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
 +    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
 +               const0, const1, const2, const3);
 +    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
 +                b0_r, b1_r, b2_r, b3_r);
 +    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
 +                b0_l, b1_l, b2_l, b3_l);
 +    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
 +    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
 +    const5 = __msa_ilvod_h(-w1, -w5);
 +    const7 = __msa_ilvod_h(w3, -w1);
 +    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
 +                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
 +    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
 +                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
 +    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
 +                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
 +                 temp0_r, temp0_l, temp1_r, temp1_l,
 +                 temp2_r, temp2_l, temp3_r, temp3_l,
 +                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
 +    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
 +    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
 +    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
 +    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
 +    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
 +                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
 +    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
 +                a0_r, a1_r, a2_r, a3_r);
 +    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
 +    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
 +    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
 +    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
 +    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
 +                temp2_r, temp2_r, temp3_r, temp3_r,
 +                temp0_r, temp1_r, temp2_r, temp3_r);
 +    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
 +    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
 +    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
 +    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
 +    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 +    dst += 4 * dst_stride;
 +    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
 +    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
 +    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
 +    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
 +    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
 +                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
 +    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
 +    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
 +    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
 +    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
 +    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 +    dst += 4 * dst_stride;
 +}
 +
 +static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
 +                                int16_t *block)
 +{
 +    int32_t const_val;
 +    uint64_t tmp0, tmp1, tmp2, tmp3;
 +    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
 +    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
 +    v8i16 w1, w3, w5, w7;
 +    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
 +    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
 +    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
 +    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
 +    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
 +    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
 +    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
 +    v4i32 w2, w4, w6;
 +    v8i16 select_vec, temp;
 +    v8i16 zero = { 0 };
 +    v4i32 const_val0 = __msa_ldi_w(1);
 +    v4i32 const_val1 = __msa_ldi_w(1);
 +
 +    const_val0 <<= 10;
 +    const_val = 16383 * ((1 << 19) / 16383);
 +    const_val1 = __msa_insert_w(const_val0, 0, const_val);
 +    const_val1 = __msa_splati_w(const_val1, 0);
 +    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
 +    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 +                       in0, in1, in2, in3, in4, in5, in6, in7);
 +
 +    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
 +    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
 +    UNPCK_SH_SW(in0, a0_r, a0_l);
 +    UNPCK_SH_SW(in2, temp3_r, temp3_l);
 +    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
 +    UNPCK_SH_SW(in4, temp4_r, temp4_l);
 +    UNPCK_SH_SW(in6, temp7_r, temp7_l);
 +    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
 +    temp = in0 << 3;
 +    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
 +    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
 +               const0, const1, const2, const3);
 +    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
 +    const5 = __msa_ilvod_h(-w1, -w5);
 +    const7 = __msa_ilvod_h(w3, -w1);
 +    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
 +                b0_r, b1_r, b2_r, b3_r);
 +    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
 +                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
 +    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
 +                b0_l, b1_l, b2_l, b3_l);
 +    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
 +                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
 +    w2 = (v4i32) __msa_splati_h(weights, 2);
 +    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
 +    w4 = (v4i32) __msa_splati_h(weights, 4);
 +    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
 +    w6 = (v4i32) __msa_splati_h(weights, 6);
 +    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
 +    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
 +    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
 +    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
 +                temp2_l, temp2_r, temp1_l, temp1_r,
 +                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
 +    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
 +    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
 +    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
 +    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
 +    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
 +    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
 +    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
 +    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
 +    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
 +    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
 +    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
 +    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
 +                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
 +                 temp0_r, temp0_l, temp1_r, temp1_l,
 +                 temp2_r, temp2_l, temp3_r, temp3_l,
 +                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
 +    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
 +    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
 +    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
 +                temp2_l, temp2_r, temp3_l, temp3_r,
 +                temp0_r, temp1_r, temp2_r, temp3_r);
 +    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
 +                               (v16u8) select_vec);
 +    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
 +    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
 +    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
 +                a0_r, a1_r, a2_r, a3_r);
 +    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
 +    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
 +    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
 +    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
 +    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 +                       in0, in1, in2, in3, in4, in5, in6, in7);
 +
 +    UNPCK_SH_SW(in0, a0_r, a0_l);
 +    UNPCK_SH_SW(in2, temp3_r, temp3_l);
 +    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
 +    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
 +    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
 +                temp2_l, temp2_r, temp1_l, temp1_r,
 +                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
 +    UNPCK_SH_SW(in4, temp0_r, temp0_l);
 +    UNPCK_SH_SW(in6, temp3_r, temp3_l);
 +    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
 +    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
 +    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
 +    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
 +    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
 +    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
 +    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
 +    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
 +    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
 +    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
 +    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
 +    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
 +    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
 +    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
 +                b0_r, b1_r, b2_r, b3_r);
 +    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
 +                b0_l, b1_l, b2_l, b3_l);
 +    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
 +                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
 +    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
 +                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
 +    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
 +                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
 +                 temp0_r, temp0_l, temp1_r, temp1_l,
 +                 temp2_r, temp2_l, temp3_r, temp3_l,
 +                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
 +    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
 +    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
 +    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
 +    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
 +                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
 +    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
 +               temp0_l, temp1_l, temp2_l, temp3_l);
 +    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
 +    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
 +    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
 +    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
 +    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
 +    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
 +    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
 +    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
 +    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
 +                temp2_r, temp2_r, temp3_r, temp3_r,
 +                temp0_r, temp1_r, temp2_r, temp3_r);
 +    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
 +    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
 +    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
 +    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
 +    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 +
 +    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
 +    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
 +    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
 +    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
 +                a0_r, a1_r, a2_r, a3_r);
 +    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
 +               a3_l, a2_l, a1_l, a0_l);
 +    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
 +    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
 +    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
 +    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
 +    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
 +    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
 +    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
 +    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
 +    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
 +                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
 +    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
 +    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
 +    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
 +    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
 +    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
 +}
 +
 +void ff_simple_idct_msa(int16_t *block)
 +{
 +    simple_idct_msa(block);
 +}
 +
- void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
++void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
 +{
 +    simple_idct_put_msa(dst, dst_stride, block);
 +}
 +
- void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
++void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
 +{
 +    simple_idct_add_msa(dst, dst_stride, block);
 +}
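
Illustration (not part of the commit): the const_val used above, 16383 * ((1 << 19) / 16383), is the column-pass rounding bias expressed as an exact multiple of W4 = 16383. It matches the scalar path earlier in this diff, which adds (1 << (COL_SHIFT - 1)) / W4 to the DC coefficient before the W4 multiply; both forms add the same 524256 (just under 2^19), so both accumulate the same value. A small standalone check:

/* Both ways of applying the column rounding bias yield the same accumulator. */
#include <stdio.h>

int main(void)
{
    const int W4 = 16383, COL_SHIFT = 20;
    int dc = 123;                                               /* arbitrary DC sample */

    int bias_first = W4 * (dc + (1 << (COL_SHIFT - 1)) / W4);   /* scalar/Alpha form */
    int bias_after = W4 * dc + W4 * ((1 << 19) / W4);           /* MSA const_val form */

    printf("bias=%d  %d == %d\n", W4 * ((1 << 19) / W4), bias_first, bias_after);
    return 0;
}
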
diff --cc libavcodec/mips/vc1dsp_mips.h
index 2cdd900,0000000..b9b07e1
mode 100644,000000..100644
--- a/libavcodec/mips/vc1dsp_mips.h
+++ b/libavcodec/mips/vc1dsp_mips.h
@@@ -1,194 -1,0 +1,194 @@@
 +/*
 + * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_MIPS_VC1DSP_MIPS_H
 +#define AVCODEC_MIPS_VC1DSP_MIPS_H
 +
 +#include "libavcodec/vc1dsp.h"
 +
 +void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc01_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc02_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc03_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc10_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc11_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc12_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc13_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc20_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc21_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc22_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc23_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc30_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc31_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc32_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc33_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +
 +void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc01_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc02_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc03_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc10_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc11_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc12_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc13_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc20_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc21_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc22_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc23_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc30_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc31_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc32_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc33_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd);
 +
 +
 +void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc01_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc02_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc03_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc10_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc11_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc12_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc13_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc20_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc21_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc22_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc23_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc30_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc31_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc32_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_put_vc1_mspel_mc33_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +
 +void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc01_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc02_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc03_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc10_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc11_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc12_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc13_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc20_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc21_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc22_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc23_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc30_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc31_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc32_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +void ff_avg_vc1_mspel_mc33_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd);
 +
 +void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]);
- void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, int linesize, int16_t *block);
- void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, int linesize, int16_t *block);
- void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, int linesize, int16_t *block);
++void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
++void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
++void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
 +
- void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
- void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
- void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
- void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block);
++void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
++void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
++void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
++void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
 +
 +void ff_vc1_v_overlap_mmi(uint8_t *src, int stride);
 +void ff_vc1_h_overlap_mmi(uint8_t *src, int stride);
 +void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom);
 +void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right);
 +
 +void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq);
 +void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq);
 +void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq);
 +void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq);
 +void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq);
 +void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq);
 +
 +void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y);
 +void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y);
 +void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y);
 +void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y);
 +
 +#endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */
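
The hunks above and below switch these inverse-transform entry points from "int linesize" to "ptrdiff_t linesize". As a rough idea of what the DC-only 8x8 path in the vc1dsp_mmi.c hunk computes, here is a minimal scalar sketch; it is not part of this commit, the helper name is made up, and it assumes the usual FFmpeg av_clip_uint8() from libavutil/common.h. The widened stride type matters because "dest += linesize" is pointer arithmetic, which wants a pointer-sized signed type on 64-bit targets:

    #include <stddef.h>
    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    /* Hypothetical scalar equivalent of ff_vc1_inv_trans_8x8_dc_mmi() below:
     * spread the rounded DC term over the 8x8 destination block. */
    static void vc1_inv_trans_8x8_dc_sketch(uint8_t *dest, ptrdiff_t linesize,
                                            int16_t *block)
    {
        int dc = block[0];

        /* same two-stage rounding as the MMI version */
        dc = (3 * dc +  1) >> 1;
        dc = (3 * dc + 16) >> 5;

        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                dest[x] = av_clip_uint8(dest[x] + dc);
            dest += linesize;   /* ptrdiff_t: exact pointer-sized step */
        }
    }
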
diff --cc libavcodec/mips/vc1dsp_mmi.c
index dfae2d9,0000000..01e7f9f
mode 100644,000000..100644
--- a/libavcodec/mips/vc1dsp_mmi.c
+++ b/libavcodec/mips/vc1dsp_mmi.c
@@@ -1,2081 -1,0 +1,2081 @@@
 +/*
 + * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 + *
 + * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "libavutil/avassert.h"
 +#include "libavcodec/vc1dsp.h"
 +#include "constants.h"
 +#include "vc1dsp_mips.h"
 +#include "hpeldsp_mips.h"
 +#include "libavutil/mips/mmiutils.h"
 +
 +
 +#define VC1_INV_TRANCS_8_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
 +                                   o1,    o2,    o3,    o4,                 \
 +                                   t1,    t2,    t3,    t4,                 \
 +                                   ff_p1, ff_p2, ff_p3, ff_p4)              \
 +        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p1"                \n\t"   \
 +        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p2"                \n\t"   \
 +        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p4"                \n\t"   \
 +        "paddh      "#o1"   ,   "#t1"   ,   "#t2"                   \n\t"   \
 +        "paddh      "#o1"   ,   "#o1"   ,   "#t3"                   \n\t"   \
 +        "paddh      "#o1"   ,   "#o1"   ,   "#t4"                   \n\t"   \
 +                                                                            \
 +        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p2"                \n\t"   \
 +        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p4"                \n\t"   \
 +        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p1"                \n\t"   \
 +        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
 +        "psubh      "#o2"   ,   "#t1"   ,   "#t2"                   \n\t"   \
 +        "psubh      "#o2"   ,   "#o2"   ,   "#t3"                   \n\t"   \
 +        "psubh      "#o2"   ,   "#o2"   ,   "#t4"                   \n\t"   \
 +                                                                            \
 +        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p1"                \n\t"   \
 +        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p4"                \n\t"   \
 +        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
 +        "psubh      "#o3"   ,   "#t1"   ,   "#t2"                   \n\t"   \
 +        "paddh      "#o3"   ,   "#o3"   ,   "#t3"                   \n\t"   \
 +        "paddh      "#o3"   ,   "#o3"   ,   "#t4"                   \n\t"   \
 +                                                                            \
 +        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p4"                \n\t"   \
 +        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
 +        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p1"                \n\t"   \
 +        "psubh      "#o4"   ,   "#t1"   ,   "#t2"                   \n\t"   \
 +        "paddh      "#o4"   ,   "#o4"   ,   "#t3"                   \n\t"   \
 +        "psubh      "#o4"   ,   "#o4"   ,   "#t4"                   \n\t"
 +
 +
 +#define VC1_INV_TRANCS_8_STEP2_MMI(fp1,   fp2,   fp3,   fp4,                \
 +                                   fp5,   fp6,   fp7,   fp8,                \
 +                                   o1,    o2,    o3,    o4,                 \
 +                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
 +        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
 +        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
 +        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
 +        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
 +        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
 +        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
 +                                                                            \
 +        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
 +        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
 +        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
 +        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
 +                                                                            \
 +        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
 +        "paddh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
 +        "psubh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
 +        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
 +                                                                            \
 +        "paddh      "#fp5"  ,   "#fp1"  ,   "#o1"                   \n\t"   \
 +        "paddh      "#fp6"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
 +        "paddh      "#fp7"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
 +        "paddh      "#fp8"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
 +                                                                            \
 +        "psubh      "#fp4"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
 +        "psubh      "#fp3"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
 +        "psubh      "#fp2"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
 +        "psubh      "#fp1"  ,   "#fp1"  ,   "#o1"                   \n\t"
 +
 +
 +#define VC1_INV_TRANCS_4_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
 +                                   fp5,   fp6,   fp7,   fp8,                \
 +                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
 +        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
 +        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
 +        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
 +        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
 +        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
 +        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
 +                                                                            \
 +        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
 +        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
 +        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
 +        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
 +        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
 +                                                                            \
 +        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
 +        "psubh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
 +        "paddh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
 +        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"
 +
 +
 +#define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4,                      \
 +                                   fp5, fp6, fp7, fp8, zero)                \
 +        "punpcklbh  "#fp5"  ,   "#fp5"  ,   "#zero"                 \n\t"   \
 +        "punpcklbh  "#fp6"  ,   "#fp6"  ,   "#zero"                 \n\t"   \
 +        "punpcklbh  "#fp7"  ,   "#fp7"  ,   "#zero"                 \n\t"   \
 +        "punpcklbh  "#fp8"  ,   "#fp8"  ,   "#zero"                 \n\t"   \
 +                                                                            \
 +        "paddh      "#fp1"  ,   "#fp1"  ,   "#fp5"                  \n\t"   \
 +        "paddh      "#fp2"  ,   "#fp2"  ,   "#fp6"                  \n\t"   \
 +        "paddh      "#fp3"  ,   "#fp3"  ,   "#fp7"                  \n\t"   \
 +        "paddh      "#fp4"  ,   "#fp4"  ,   "#fp8"                  \n\t"   \
 +                                                                            \
 +        "packushb   "#fp1"  ,   "#fp1"  ,   "#zero"                 \n\t"   \
 +        "packushb   "#fp2"  ,   "#fp2"  ,   "#zero"                 \n\t"   \
 +        "packushb   "#fp3"  ,   "#fp3"  ,   "#zero"                 \n\t"   \
 +        "packushb   "#fp4"  ,   "#fp4"  ,   "#zero"                 \n\t"
 +
 +
 +/* Do inverse transform on 8x8 block */
- void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int dc = block[0];
 +    double ftmp[9];
 +    mips_reg addr[1];
 +    int count;
 +
 +    dc = (3 * dc +  1) >> 1;
 +    dc = (3 * dc + 16) >> 5;
 +
 +    __asm__ volatile(
 +        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 +        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 +        "li         %[count],   0x02                                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_LDC1(%[ftmp1], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
 +        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 +        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 +        MMI_LDC1(%[ftmp4], %[addr0], 0x00)
 +
 +        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 +
 +        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 +
 +        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
 +        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
 +        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 +        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
 +
 +        MMI_SDC1(%[ftmp1], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
 +        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 +        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 +        MMI_SDC1(%[ftmp4], %[addr0], 0x00)
 +
 +        "addiu      %[count],   %[count],       -0x01                   \n\t"
 +        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
 +        "bnez       %[count],   1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),
 +          [addr0]"=&r"(addr[0]),
 +          [count]"=&r"(count),          [dest]"+&r"(dest)
 +        : [linesize]"r"((mips_reg)linesize),
 +          [dc]"f"(dc)
 +        : "memory"
 +    );
 +}
 +
 +#if _MIPS_SIM != _ABIO32
 +void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
 +{
 +    DECLARE_ALIGNED(16, int16_t, temp[64]);
 +    int16_t *src = block;
 +    int16_t *dst = temp;
 +    double ftmp[16];
 +    uint32_t count, tmp[1];
 +
 +    // 1st loop
 +    __asm__ volatile (
 +        "li         %[tmp0],    0x03                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +        "li         %[count],   0x02                                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_LDC1(%[ftmp5], %[src], 0x10)
 +        MMI_LDC1(%[ftmp6], %[src], 0x30)
 +        MMI_LDC1(%[ftmp7], %[src], 0x50)
 +        MMI_LDC1(%[ftmp8], %[src], 0x70)
 +
 +        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
 +                                   %[ff_pw_4])
 +
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x40)
 +        MMI_LDC1(%[ftmp3], %[src], 0x20)
 +        MMI_LDC1(%[ftmp4], %[src], 0x60)
 +
 +        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
 +                                   %[ff_pw_4])
 +
 +
 +        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
 +
 +        TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
 +
 +        MMI_SDC1(%[ftmp5], %[dst], 0x00)
 +        MMI_SDC1(%[ftmp6], %[dst], 0x10)
 +        MMI_SDC1(%[ftmp7], %[dst], 0x20)
 +        MMI_SDC1(%[ftmp8], %[dst], 0x30)
 +
 +        TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
 +                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
 +
 +        MMI_SDC1(%[ftmp4], %[dst], 0x08)
 +        MMI_SDC1(%[ftmp3], %[dst], 0x18)
 +        MMI_SDC1(%[ftmp2], %[dst], 0x28)
 +        MMI_SDC1(%[ftmp1], %[dst], 0x38)
 +
 +        "addiu      %[count],   %[count],  -0x01                        \n\t"
 +        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
 +        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
 +        "bnez       %[count],   1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 +          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 +          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 +          [tmp0]"=&r"(tmp[0]),
 +          [count]"=&r"(count),
 +          [src]"+&r"(src),              [dst]"+&r"(dst)
 +        : [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
 +          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
 +          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
 +        : "memory"
 +    );
 +
 +    src = temp;
 +    dst = block;
 +
 +    // 2nd loop
 +    __asm__ volatile (
 +        "li         %[tmp0],    0x07                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +        "li         %[count],   0x02                                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_LDC1(%[ftmp5], %[src], 0x10)
 +        MMI_LDC1(%[ftmp6], %[src], 0x30)
 +        MMI_LDC1(%[ftmp7], %[src], 0x50)
 +        MMI_LDC1(%[ftmp8], %[src], 0x70)
 +
 +        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
 +                                   %[ff_pw_4])
 +
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x40)
 +        MMI_LDC1(%[ftmp3], %[src], 0x20)
 +        MMI_LDC1(%[ftmp4], %[src], 0x60)
 +
 +        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
 +                                   %[ff_pw_64])
 +
 +        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
 +        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
 +        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"
 +
 +        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
 +
 +        MMI_SDC1(%[ftmp5], %[dst], 0x00)
 +        MMI_SDC1(%[ftmp6], %[dst], 0x10)
 +        MMI_SDC1(%[ftmp7], %[dst], 0x20)
 +        MMI_SDC1(%[ftmp8], %[dst], 0x30)
 +
 +        MMI_SDC1(%[ftmp4], %[dst], 0x40)
 +        MMI_SDC1(%[ftmp3], %[dst], 0x50)
 +        MMI_SDC1(%[ftmp2], %[dst], 0x60)
 +        MMI_SDC1(%[ftmp1], %[dst], 0x70)
 +
 +        "addiu      %[count],   %[count],  -0x01                        \n\t"
 +        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
 +        PTR_ADDIU  "%[dst],     %[dst],     0x08                        \n\t"
 +        "bnez       %[count],   1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 +          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 +          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 +          [tmp0]"=&r"(tmp[0]),
 +          [count]"=&r"(count),
 +          [src]"+&r"(src),              [dst]"+&r"(dst)
 +        : [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
 +          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
 +          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
 +          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
 +        : "memory"
 +    );
 +}
 +#endif
 +
 +/* Do inverse transform on 8x4 part of block */
- void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int dc = block[0];
 +    double ftmp[9];
 +
 +    dc = ( 3 * dc +  1) >> 1;
 +    dc = (17 * dc + 64) >> 7;
 +
 +    __asm__ volatile(
 +        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 +        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 +
 +        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
 +        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
 +        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
 +        MMI_LDC1(%[ftmp4], %[dest3], 0x00)
 +
 +        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 +
 +        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 +
 +        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
 +        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
 +        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 +        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
 +
 +        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
 +        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
 +        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
 +        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8])
 +        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 +          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 +          [dc]"f"(dc)
 +        : "memory"
 +    );
 +}
 +
 +#if _MIPS_SIM != _ABIO32
- void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int16_t *src = block;
 +    int16_t *dst = block;
 +    double ftmp[16];
 +    uint32_t tmp[1];
 +    mips_reg addr[1];
 +    DECLARE_VAR_LOW32;
 +
 +    // 1st loop
 +    __asm__ volatile (
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x08)
 +        MMI_LDC1(%[ftmp3], %[src], 0x10)
 +        MMI_LDC1(%[ftmp4], %[src], 0x18)
 +        MMI_LDC1(%[ftmp5], %[src], 0x20)
 +        MMI_LDC1(%[ftmp6], %[src], 0x28)
 +        MMI_LDC1(%[ftmp7], %[src], 0x30)
 +        MMI_LDC1(%[ftmp8], %[src], 0x38)
 +
 +        //             a1        b1        a3        b2
 +        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
 +                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
 +
 +        //             a2        b3        a4        b4
 +        TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
 +                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
 +
 +        // input b1 b2 b3 b4
 +        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
 +                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
 +                                   %[ff_pw_4])
 +        // input a1 a2 a3 a4
 +        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
 +                                   %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
 +                                   %[ff_pw_4])
 +
 +        "li         %[tmp0],    0x03                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +
 +        PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
 +                    %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])
 +
 +        TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
 +                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
 +
 +        MMI_SDC1(%[ftmp3], %[dst], 0x00)
 +        MMI_SDC1(%[ftmp7], %[dst], 0x10)
 +        MMI_SDC1(%[ftmp4], %[dst], 0x20)
 +        MMI_SDC1(%[ftmp8], %[dst], 0x30)
 +
 +        TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
 +                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
 +
 +        MMI_SDC1(%[ftmp6], %[dst], 0x08)
 +        MMI_SDC1(%[ftmp5], %[dst], 0x18)
 +        MMI_SDC1(%[ftmp2], %[dst], 0x28)
 +        MMI_SDC1(%[ftmp1], %[dst], 0x38)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 +          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 +          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 +          [tmp0]"=&r"(tmp[0])
 +        : [src]"r"(src),                [dst]"r"(dst),
 +          [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
 +          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
 +          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
 +        : "memory"
 +    );
 +
 +    src = block;
 +
 +    // 2nd loop
 +    __asm__ volatile (
 +        "li         %[tmp0],    0x07                                    \n\t"
 +        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
 +
 +        // dest low 32bit
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x20)
 +        MMI_LDC1(%[ftmp3], %[src], 0x30)
 +        MMI_LDC1(%[ftmp4], %[src], 0x10)
 +
 +        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
 +                                   %[ff_pw_64])
 +
 +        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])
 +
 +        MMI_LWC1(%[ftmp5], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp8], %[addr0], 0x00)
 +
 +        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp0])
 +
 +        MMI_SWC1(%[ftmp1], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
 +
 +        // dest high 32bit
 +        MMI_LDC1(%[ftmp1], %[src], 0x08)
 +        MMI_LDC1(%[ftmp2], %[src], 0x28)
 +        MMI_LDC1(%[ftmp3], %[src], 0x38)
 +        MMI_LDC1(%[ftmp4], %[src], 0x18)
 +
 +        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
 +                                   %[ff_pw_64])
 +
 +        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])
 +
 +        MMI_LWC1(%[ftmp5], %[dest], 0x04)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp6], %[addr0], 0x04)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp7], %[addr0], 0x04)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp8], %[addr0], 0x04)
 +
 +        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp0])
 +
 +        MMI_SWC1(%[ftmp1], %[dest], 0x04)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp2], %[addr0], 0x04)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp3], %[addr0], 0x04)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp4], %[addr0], 0x04)
 +
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [tmp0]"=&r"(tmp[0]),
 +          RESTRICT_ASM_LOW32
 +          [addr0]"=&r"(addr[0])
 +        : [src]"r"(src),                [dest]"r"(dest),
 +          [linesize]"r"((mips_reg)linesize),
 +          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
 +          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
 +        : "memory"
 +    );
 +}
 +#endif
 +
 +/* Do inverse transform on 4x8 parts of block */
- void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int dc = block[0];
 +    double ftmp[9];
 +    DECLARE_VAR_LOW32;
 +
 +    dc = (17 * dc +  4) >> 3;
 +    dc = (12 * dc + 64) >> 7;
 +
 +    __asm__ volatile(
 +        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 +        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 +
 +        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
 +        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
 +        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
 +        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
 +        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
 +        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
 +        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
 +        MMI_LWC1(%[ftmp8], %[dest7], 0x00)
 +
 +        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
 +
 +        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 +
 +        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
 +
 +        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
 +        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
 +        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
 +        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
 +        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
 +        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
 +        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
 +        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          RESTRICT_ASM_LOW32
 +          [ftmp8]"=&f"(ftmp[8])
 +        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 +          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 +          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
 +          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
 +          [dc]"f"(dc)
 +        : "memory"
 +    );
 +}
 +
 +#if _MIPS_SIM != _ABIO32
- void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int16_t *src = block;
 +    int16_t *dst = block;
 +    double ftmp[16];
 +    uint32_t count, tmp[1];
 +    mips_reg addr[1];
 +    DECLARE_VAR_LOW32;
 +
 +    // 1st loop
 +    __asm__ volatile (
 +        "li         %[count],   0x02                                    \n\t"
 +        "li         %[tmp0],    0x03                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x10)
 +        MMI_LDC1(%[ftmp3], %[src], 0x20)
 +        MMI_LDC1(%[ftmp4], %[src], 0x30)
 +
 +        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
 +
 +        //                              t1        t2        t3        t4
 +        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
 +                                   %[ff_pw_4])
 +
 +        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])
 +
 +        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
 +                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
 +
 +        MMI_SDC1(%[ftmp1], %[dst], 0x00)
 +        MMI_SDC1(%[ftmp3], %[dst], 0x10)
 +        MMI_SDC1(%[ftmp4], %[dst], 0x20)
 +        MMI_SDC1(%[ftmp2], %[dst], 0x30)
 +
 +        "addiu      %[count],   %[count],  -0x01                        \n\t"
 +        PTR_ADDIU  "%[src],     %[src],     0x40                        \n\t"
 +        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
 +        "bnez       %[count],   1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 +          [tmp0]"=&r"(tmp[0]),
 +          [count]"=&r"(count),
 +          [src]"+&r"(src),              [dst]"+&r"(dst)
 +        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
 +          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
 +        : "memory"
 +    );
 +
 +    src = block;
 +
 +    // 2nd loop
 +    __asm__ volatile (
 +        "li         %[tmp0],    0x07                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +
 +        MMI_LDC1(%[ftmp5], %[src], 0x10)
 +        MMI_LDC1(%[ftmp6], %[src], 0x30)
 +        MMI_LDC1(%[ftmp7], %[src], 0x50)
 +        MMI_LDC1(%[ftmp8], %[src], 0x70)
 +
 +        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
 +                                   %[ff_pw_4])
 +
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x40)
 +        MMI_LDC1(%[ftmp3], %[src], 0x20)
 +        MMI_LDC1(%[ftmp4], %[src], 0x60)
 +
 +        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
 +                                   %[ff_pw_64])
 +
 +        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
 +        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
 +        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"
 +
 +        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
 +
 +        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 +
 +        // dest low
 +        MMI_LWC1(%[ftmp9], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp12], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +
 +        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ftmp0])
 +
 +        // dest high
 +        MMI_LWC1(%[ftmp9], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp12], %[addr0], 0x00)
 +
 +        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
 +                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
 +                                   %[ftmp0])
 +
 +        // dest low
 +        MMI_SWC1(%[ftmp5], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp7], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp8], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +
 +        // dest high
 +        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp1], %[addr0], 0x00)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 +          [ftmp12]"=&f"(ftmp[12]),
 +          [tmp0]"=&r"(tmp[0]),
 +          RESTRICT_ASM_LOW32
 +          [addr0]"=&r"(addr[0]),
 +          [dest]"+&r"(dest)
 +        : [src]"r"(src),                [linesize]"r"(linesize),
 +          [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
 +          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
 +          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
 +          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
 +        : "memory"
 +    );
 +}
 +#endif
 +
 +/* Do inverse transform on 4x4 part of block */
- void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int dc = block[0];
 +    double ftmp[5];
 +    DECLARE_VAR_LOW32;
 +
 +    dc = (17 * dc +  4) >> 3;
 +    dc = (17 * dc + 64) >> 7;
 +
 +    __asm__ volatile(
 +        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 +        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 +
 +        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
 +        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
 +        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
 +        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
 +
 +        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 +
 +        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 +        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 +
 +        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 +        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 +
 +        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
 +        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
 +        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
 +        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          RESTRICT_ASM_LOW32
 +          [ftmp4]"=&f"(ftmp[4])
 +        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 +          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 +          [dc]"f"(dc)
 +        : "memory"
 +    );
 +}
 +
- void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, int linesize, int16_t *block)
++void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +{
 +    int16_t *src = block;
 +    int16_t *dst = block;
 +    double ftmp[12];
 +    uint32_t tmp[1];
 +    mips_reg addr[1];
 +    DECLARE_VAR_LOW32;
 +
 +    // 1st loop
 +    __asm__ volatile (
 +        "li         %[tmp0],    0x03                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x10)
 +        MMI_LDC1(%[ftmp3], %[src], 0x20)
 +        MMI_LDC1(%[ftmp4], %[src], 0x30)
 +
 +        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
 +
 +        //                              t1        t2        t3        t4
 +        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
 +                                   %[ff_pw_4])
 +
 +        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])
 +
 +        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
 +                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
 +
 +        MMI_SDC1(%[ftmp1], %[dst], 0x00)
 +        MMI_SDC1(%[ftmp3], %[dst], 0x10)
 +        MMI_SDC1(%[ftmp4], %[dst], 0x20)
 +        MMI_SDC1(%[ftmp2], %[dst], 0x30)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 +          [tmp0]"=&r"(tmp[0]),
 +          [src]"+&r"(src),              [dst]"+&r"(dst)
 +        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
 +          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
 +        : "memory"
 +    );
 +
 +    src = block;
 +
 +    // 2nd loop
 +    __asm__ volatile (
 +        "li         %[tmp0],    0x07                                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 +
 +        // dest low 32bit
 +        MMI_LDC1(%[ftmp1], %[src], 0x00)
 +        MMI_LDC1(%[ftmp2], %[src], 0x20)
 +        MMI_LDC1(%[ftmp3], %[src], 0x30)
 +        MMI_LDC1(%[ftmp4], %[src], 0x10)
 +
 +        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
 +                                   %[ff_pw_64])
 +
 +        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])
 +
 +        MMI_LWC1(%[ftmp5], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_LWC1(%[ftmp8], %[addr0], 0x00)
 +
 +        "xor        %[ftmp9],   %[ftmp9],  %[ftmp9]                     \n\t"
 +
 +        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
 +                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
 +                                   %[ftmp9])
 +
 +        MMI_SWC1(%[ftmp1], %[dest], 0x00)
 +        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
 +        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
 +        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [tmp0]"=&r"(tmp[0]),
 +          RESTRICT_ASM_LOW32
 +          [addr0]"=&r"(addr[0])
 +        : [src]"r"(src),                [dest]"r"(dest),
 +          [linesize]"r"((mips_reg)linesize),
 +          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
 +          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
 +        : "memory"
 +    );
 +}
 +
 +/* Apply overlap transform across a vertical block edge (horizontal filtering) */
 +void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
 +{
 +    int i;
 +    int a, b, c, d;
 +    int d1, d2;
 +    int rnd = 1;
 +    for (i = 0; i < 8; i++) {
 +        a  = src[-2];
 +        b  = src[-1];
 +        c  = src[0];
 +        d  = src[1];
 +        d1 = (a - d + 3 + rnd) >> 3;
 +        d2 = (a - d + b - c + 4 - rnd) >> 3;
 +
 +        src[-2] = a - d1;
 +        src[-1] = av_clip_uint8(b - d2);
 +        src[0]  = av_clip_uint8(c + d2);
 +        src[1]  = d + d1;
 +        src    += stride;
 +        rnd     = !rnd;
 +    }
 +}
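
As a concrete illustration of the smoothing above (the input values are
arbitrary), one iteration with a = 100, b = 90, c = 80, d = 70 and rnd = 1
gives:

    d1 = (100 - 70 + 3 + 1) >> 3 = 4
    d2 = (100 - 70 + 90 - 80 + 4 - 1) >> 3 = 5
    src[-2] = 100 - 4 = 96        src[-1] = clip(90 - 5) = 85
    src[0]  = clip(80 + 5) = 85   src[1]  = 70 + 4 = 74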
 +
 +void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right)
 +{
 +    int i;
 +    int a, b, c, d;
 +    int d1, d2;
 +    int rnd1 = 4, rnd2 = 3;
 +    for (i = 0; i < 8; i++) {
 +        a  = left[6];
 +        b  = left[7];
 +        c  = right[0];
 +        d  = right[1];
 +        d1 = a - d;
 +        d2 = a - d + b - c;
 +
 +        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
 +        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
 +        right[0] = ((c << 3) + d2 + rnd1) >> 3;
 +        right[1] = ((d << 3) + d1 + rnd2) >> 3;
 +
 +        right += 8;
 +        left  += 8;
 +        rnd2   = 7 - rnd2;
 +        rnd1   = 7 - rnd1;
 +    }
 +}
 +
 +/* Apply overlap transform across a horizontal block edge (vertical filtering) */
 +void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
 +{
 +    int i;
 +    int a, b, c, d;
 +    int d1, d2;
 +    int rnd = 1;
 +    for (i = 0; i < 8; i++) {
 +        a  = src[-2 * stride];
 +        b  = src[-stride];
 +        c  = src[0];
 +        d  = src[stride];
 +        d1 = (a - d + 3 + rnd) >> 3;
 +        d2 = (a - d + b - c + 4 - rnd) >> 3;
 +
 +        src[-2 * stride] = a - d1;
 +        src[-stride]     = av_clip_uint8(b - d2);
 +        src[0]           = av_clip_uint8(c + d2);
 +        src[stride]      = d + d1;
 +        src++;
 +        rnd = !rnd;
 +    }
 +}
 +
 +void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
 +{
 +    int i;
 +    int a, b, c, d;
 +    int d1, d2;
 +    int rnd1 = 4, rnd2 = 3;
 +    for (i = 0; i < 8; i++) {
 +        a  = top[48];
 +        b  = top[56];
 +        c  = bottom[0];
 +        d  = bottom[8];
 +        d1 = a - d;
 +        d2 = a - d + b - c;
 +
 +        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
 +        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
 +        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
 +        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
 +
 +        bottom++;
 +        top++;
 +        rnd2 = 7 - rnd2;
 +        rnd1 = 7 - rnd1;
 +    }
 +}
 +
 +/**
 + * VC-1 in-loop deblocking filter for one line
 + * @param src pointer to the first pixel after the edge to filter
 + * @param stride block stride
 + * @param pq block quantizer
 + * @return whether other 3 pairs should be filtered or not
 + * @see 8.6
 + */
 +static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
 +{
 +    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
 +              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
 +    int a0_sign = a0 >> 31;        /* Store sign */
 +
 +    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
 +    if (a0 < pq) {
 +        int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
 +                        5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
 +        int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
 +                        5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
 +        if (a1 < a0 || a2 < a0) {
 +            int clip      = src[-1 * stride] - src[0 * stride];
 +            int clip_sign = clip >> 31;
 +
 +            clip = ((clip ^ clip_sign) - clip_sign) >> 1;
 +            if (clip) {
 +                int a3     = FFMIN(a1, a2);
 +                int d      = 5 * (a3 - a0);
 +                int d_sign = (d >> 31);
 +
 +                d       = ((d ^ d_sign) - d_sign) >> 3;
 +                d_sign ^= a0_sign;
 +
 +                if (d_sign ^ clip_sign)
 +                    d = 0;
 +                else {
 +                    d = FFMIN(d, clip);
 +                    d = (d ^ d_sign) - d_sign; /* Restore sign */
 +                    src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
 +                    src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
 +                }
 +                return 1;
 +            }
 +        }
 +    }
 +    return 0;
 +}
 +
 +/**
 + * VC-1 in-loop deblocking filter
 + * @param src pointer to the first pixel after the edge to filter
 + * @param step distance between horizontally adjacent elements
 + * @param stride distance between vertically adjacent elements
 + * @param len edge length to filter (4 or 8 pixels)
 + * @param pq block quantizer
 + * @see 8.6
 + */
 +static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
 +                                   int len, int pq)
 +{
 +    int i;
 +    int filt3;
 +
 +    for (i = 0; i < len; i += 4) {
 +        filt3 = vc1_filter_line(src + 2 * step, stride, pq);
 +        if (filt3) {
 +            vc1_filter_line(src + 0 * step, stride, pq);
 +            vc1_filter_line(src + 1 * step, stride, pq);
 +            vc1_filter_line(src + 3 * step, stride, pq);
 +        }
 +        src += step * 4;
 +    }
 +}
 +
 +void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
 +{
 +    vc1_loop_filter(src, 1, stride, 4, pq);
 +}
 +
 +void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
 +{
 +    vc1_loop_filter(src, stride, 1, 4, pq);
 +}
 +
 +void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
 +{
 +    vc1_loop_filter(src, 1, stride, 8, pq);
 +}
 +
 +void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
 +{
 +    vc1_loop_filter(src, stride, 1, 8, pq);
 +}
 +
 +void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
 +{
 +    vc1_loop_filter(src, 1, stride, 16, pq);
 +}
 +
 +void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
 +{
 +    vc1_loop_filter(src, stride, 1, 16, pq);
 +}
 +
 +void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd)
 +{
 +    ff_put_pixels8_8_mmi(dst, src, stride, 8);
 +}
 +void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd)
 +{
 +    ff_put_pixels16_8_mmi(dst, src, stride, 16);
 +}
 +void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
 +                               ptrdiff_t stride, int rnd)
 +{
 +    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
 +}
 +void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
 +                                  ptrdiff_t stride, int rnd)
 +{
 +    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
 +}
 +
 +#define OP_PUT(S, D)
 +#define OP_AVG(S, D)                                                        \
 +    "ldc1       $f16,   "#S"                        \n\t"                   \
 +    "pavgb      "#D",   "#D",   $f16                \n\t"
 +
 +/** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
 +#define NORMALIZE_MMI(SHIFT)                                                \
 +    "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
 +    "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
 +    "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
 +    "psrah      $f8,    $f8,    "SHIFT"             \n\t"
 +
 +#define TRANSFER_DO_PACK(OP)                                                \
 +    "packushb   $f6,    $f6,    $f8                 \n\t"                   \
 +    OP((%[dst]), $f6)                                                       \
 +    "sdc1       $f6,    0x00(%[dst])                \n\t"
 +
 +#define TRANSFER_DONT_PACK(OP)                                              \
 +     OP(0(%[dst]), $f6)                                                     \
 +     OP(8(%[dst]), $f8)                                                     \
 +     "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
 +     "sdc1      $f8,    0x08(%[dst])                \n\t"
 +
 +/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
 +#define DO_UNPACK(reg)                                                      \
 +    "punpcklbh  "reg",  "reg",  $f0                 \n\t"
 +#define DONT_UNPACK(reg)
 +
 +/** Load the rounder 32-r or 8-r and replicate it across the four 16-bit lanes of $f14 */
 +#define LOAD_ROUNDER_MMI(ROUND)                                             \
 +    "lwc1       $f14,   "ROUND"                     \n\t"                   \
 +    "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
 +    "punpcklwd  $f14,   $f14,   $f14                \n\t"
 +
 +
 +#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
 +    "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
 +    PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
 +    MMI_ULWC1(R0, $9, 0x00)                                                 \
 +    "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
 +    "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
 +    PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
 +    MMI_ULWC1(R3, $9, 0x00)                                                 \
 +    "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
 +    "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
 +    "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
 +    "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
 +    "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
 +    MMI_SDC1(R1, %[dst], OFF)                                               \
 +    PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
 +
 +/** Sacrificing $f12 makes it possible to pipeline loads from src */
 +static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
 +                                       const uint8_t *src, mips_reg stride,
 +                                       int rnd, int64_t shift)
 +{
 +    DECLARE_VAR_LOW32;
 +    DECLARE_VAR_ADDRT;
 +
 +    __asm__ volatile(
 +        "xor        $f0,    $f0,    $f0             \n\t"
 +        "li         $8,     0x03                    \n\t"
 +        LOAD_ROUNDER_MMI("%[rnd]")
 +        "ldc1       $f12,   %[ff_pw_9]              \n\t"
 +        "1:                                         \n\t"
 +        MMI_ULWC1($f4, %[src], 0x00)
 +        PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
 +        MMI_ULWC1($f6, %[src], 0x00)
 +        "punpcklbh  $f4,    $f4,    $f0             \n\t"
 +        "punpcklbh  $f6,    $f6,    $f0             \n\t"
 +        SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
 +        SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
 +        SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
 +        SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
 +        SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
 +        SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
 +        SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
 +        SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
 +        PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
 +        PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
 +        "addiu      $8,     $8,    -0x01            \n\t"
 +        "bnez       $8,     1b                      \n\t"
 +        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
 +          [src]"+r"(src),               [dst]"+r"(dst)
 +        : [stride]"r"(stride),          [stride1]"r"(-2*stride),
 +          [shift]"f"(shift),            [rnd]"m"(rnd),
 +          [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
 +        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
 +          "$f14", "$f16", "memory"
 +    );
 +}
 +
 +/**
 + * The data is already unpacked, so some operations can be performed directly
 + * from memory.
 + */
 +#define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
 +static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
 +                                             const int16_t *src, int rnd)   \
 +{                                                                           \
 +    int h = 8;                                                              \
 +    DECLARE_VAR_ALL64;                                                      \
 +    DECLARE_VAR_ADDRT;                                                      \
 +                                                                            \
 +    src -= 1;                                                               \
 +    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
 +                                                                            \
 +    __asm__ volatile(                                                       \
 +        LOAD_ROUNDER_MMI("%[rnd]")                                          \
 +        "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
 +        "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
 +        "1:                                         \n\t"                   \
 +        MMI_ULDC1($f2, %[src], 0x00)                                        \
 +        MMI_ULDC1($f4, %[src], 0x08)                                        \
 +        MMI_ULDC1($f6, %[src], 0x02)                                        \
 +        MMI_ULDC1($f8, %[src], 0x0a)                                        \
 +        MMI_ULDC1($f0, %[src], 0x06)                                        \
 +        "paddh      $f2,    $f2,    $f0             \n\t"                   \
 +        MMI_ULDC1($f0, %[src], 0x0e)                                        \
 +        "paddh      $f4,    $f4,    $f0             \n\t"                   \
 +        MMI_ULDC1($f0, %[src], 0x04)                                        \
 +        "paddh      $f6,    $f6,    $f0             \n\t"                   \
 +        MMI_ULDC1($f0, %[src], 0x0b)                                        \
 +        "paddh      $f8,    $f8,    $f0             \n\t"                   \
 +        "pmullh     $f6,    $f6,    $f10            \n\t"                   \
 +        "pmullh     $f8,    $f8,    $f10            \n\t"                   \
 +        "psubh      $f6,    $f6,    $f2             \n\t"                   \
 +        "psubh      $f8,    $f8,    $f4             \n\t"                   \
 +        "li         $8,     0x07                    \n\t"                   \
 +        "mtc1       $8,     $f16                    \n\t"                   \
 +        NORMALIZE_MMI("$f16")                                               \
 +        /* Remove bias */                                                   \
 +        "paddh      $f6,    $f6,    $f12            \n\t"                   \
 +        "paddh      $f8,    $f8,    $f12            \n\t"                   \
 +        TRANSFER_DO_PACK(OP)                                                \
 +        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
 +        PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
 +        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
 +        "bnez       %[h],   1b                      \n\t"                   \
 +        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
 +          [h]"+r"(h),                                                       \
 +          [src]"+r"(src),               [dst]"+r"(dst)                      \
 +        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
 +          [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
 +        : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
 +          "$f16", "memory"                                                  \
 +    );                                                                      \
 +}
 +
 +VC1_HOR_16B_SHIFT2(OP_PUT, put_)
 +VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
 +
 +/**
 + * Purely vertical or horizontal 1/2 shift interpolation.
 + * Sacrifice $f12 for the *9 factor.
 + */
 +#define VC1_SHIFT2(OP, OPNAME)\
 +static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
 +                                     mips_reg stride, int rnd,              \
 +                                     mips_reg offset)                       \
 +{                                                                           \
 +    DECLARE_VAR_LOW32;                                                      \
 +    DECLARE_VAR_ADDRT;                                                      \
 +                                                                            \
 +    rnd = 8 - rnd;                                                          \
 +                                                                            \
 +    __asm__ volatile(                                                       \
 +        "xor        $f0,    $f0,    $f0             \n\t"                   \
 +        "li         $10,    0x08                    \n\t"                   \
 +        LOAD_ROUNDER_MMI("%[rnd]")                                          \
 +        "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
 +        "1:                                         \n\t"                   \
 +        MMI_ULWC1($f6, %[src], 0x00)                                        \
 +        MMI_ULWC1($f8, %[src], 0x04)                                        \
 +        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
 +        MMI_ULWC1($f2, $9, 0x00)                                            \
 +        MMI_ULWC1($f4, $9, 0x04)                                            \
 +        PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
 +        "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
 +        "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
 +        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
 +        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
 +        "paddh      $f6,    $f6,    $f2             \n\t"                   \
 +        "paddh      $f8,    $f8,    $f4             \n\t"                   \
 +        PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
 +        MMI_ULWC1($f2, $9, 0x00)                                            \
 +        MMI_ULWC1($f4, $9, 0x04)                                            \
 +        "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
 +        "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
 +        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
 +        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
 +        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
 +        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
 +        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
 +        MMI_ULWC1($f2, $9, 0x00)                                            \
 +        MMI_ULWC1($f4, $9, 0x04)                                            \
 +        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
 +        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
 +        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
 +        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
 +        "li         $8,     0x04                    \n\t"                   \
 +        "mtc1       $8,     $f16                    \n\t"                   \
 +        NORMALIZE_MMI("$f16")                                               \
 +        "packushb   $f6,    $f6,    $f8             \n\t"                   \
 +        OP((%[dst]), $f6)                                                   \
 +        "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
 +        "addiu      $10,    $10,   -0x01            \n\t"                   \
 +        PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
 +        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
 +        "bnez       $10,    1b                      \n\t"                   \
 +        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
 +          [src]"+r"(src),               [dst]"+r"(dst)                      \
 +        : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
 +          [stride]"g"(stride),          [rnd]"m"(rnd),                      \
 +          [stride1]"g"(stride-offset),                                      \
 +          [ff_pw_9]"m"(ff_pw_9)                                             \
 +        : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
 +          "$f12", "$f14", "$f16", "memory"                                  \
 +    );                                                                      \
 +}
 +
 +VC1_SHIFT2(OP_PUT, put_)
 +VC1_SHIFT2(OP_AVG, avg_)
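
For reference, a scalar sketch of what the put_ variant generated above
computes; the helper name is made up for illustration. offset is 1 for
horizontal and stride for vertical half-pel filtering, matching the
(-1, 9, 9, -1) taps and the 8 - rnd rounder used in the asm:

    static void put_vc1_shift2_c(uint8_t *dst, const uint8_t *src,
                                 int stride, int rnd, int offset)
    {
        int x, y;

        for (y = 0; y < 8; y++) {
            for (x = 0; x < 8; x++)
                dst[x] = av_clip_uint8((-    src[x -     offset] +
                                         9 * src[x             ] +
                                         9 * src[x +     offset] -
                                             src[x + 2 * offset] +
                                         8 - rnd) >> 4);
            src += stride;
            dst += stride;
        }
    }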
 +
 +/**
 + * Core of the 1/4 and 3/4 shift bicubic interpolation.
 + *
 + * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
 + * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
 + * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
 + * @param A1      Stride address of 1st tap (beware of unpacked/packed).
 + * @param A2      Stride address of 2nd tap
 + * @param A3      Stride address of 3rd tap
 + * @param A4      Stride address of 4th tap
 + */
 +#define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
 +    PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
 +    LOAD($f2, $9, M*0)                                                      \
 +    LOAD($f4, $9, M*4)                                                      \
 +    UNPACK("$f2")                                                           \
 +    UNPACK("$f4")                                                           \
 +    "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
 +    "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
 +    PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
 +    LOAD($f6, $9, M*0)                                                      \
 +    LOAD($f8, $9, M*4)                                                      \
 +    UNPACK("$f6")                                                           \
 +    UNPACK("$f8")                                                           \
 +    "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
 +    "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
 +    "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
 +    "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
 +    PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
 +    LOAD($f2, $9, M*0)                                                      \
 +    LOAD($f4, $9, M*4)                                                      \
 +    UNPACK("$f2")                                                           \
 +    UNPACK("$f4")                                                           \
 +    "li         $8,     0x02                    \n\t"                       \
 +    "mtc1       $8,     $f16                    \n\t"                       \
 +    "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
 +    "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
 +    "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
 +    "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
 +    PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
 +    LOAD($f2, $9, M*0)                                                      \
 +    LOAD($f4, $9, M*4)                                                      \
 +    UNPACK("$f2")                                                           \
 +    UNPACK("$f4")                                                           \
 +    "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
 +    "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
 +    "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
 +    "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
 +
 +/**
 + * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 + * Here, offset=src_stride. Parameters passed as A1 to A4 must use
 + * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and
 + * %[stride_x3] (3*src_stride).
 + *
 + * @param  NAME   Either 1 or 3
 + * @see MSPEL_FILTER13_CORE for information on A1->A4
 + */
 +#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
 +static void                                                                 \
 +vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
 +                                 mips_reg src_stride,                       \
 +                                 int rnd, int64_t shift)                    \
 +{                                                                           \
 +    int h = 8;                                                              \
 +    DECLARE_VAR_LOW32;                                                      \
 +    DECLARE_VAR_ADDRT;                                                      \
 +                                                                            \
 +    src -= src_stride;                                                      \
 +                                                                            \
 +    __asm__ volatile(                                                       \
 +        "xor        $f0,    $f0,    $f0             \n\t"                   \
 +        LOAD_ROUNDER_MMI("%[rnd]")                                          \
 +        "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
 +        "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
 +        ".p2align 3                                 \n\t"                   \
 +        "1:                                         \n\t"                   \
 +        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
 +        NORMALIZE_MMI("%[shift]")                                           \
 +        TRANSFER_DONT_PACK(OP_PUT)                                          \
 +        /* Last 3 (in fact 4) bytes on the line */                          \
 +        PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
 +        MMI_ULWC1($f2, $9, 0x08)                                            \
 +        DO_UNPACK("$f2")                                                    \
 +        "mov.d      $f6,    $f2                     \n\t"                   \
 +        "paddh      $f2,    $f2,    $f2             \n\t"                   \
 +        "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
 +        PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
 +        MMI_ULWC1($f6, $9, 0x08)                                            \
 +        DO_UNPACK("$f6")                                                    \
 +        "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
 +        "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
 +        PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
 +        MMI_ULWC1($f2, $9, 0x08)                                            \
 +        DO_UNPACK("$f2")                                                    \
 +        "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
 +        "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
 +        PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
 +        MMI_ULWC1($f2, $9, 0x08)                                            \
 +        DO_UNPACK("$f2")                                                    \
 +        "li         $8,     0x02                    \n\t"                   \
 +        "mtc1       $8,     $f16                    \n\t"                   \
 +        "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
 +        "psubh      $f6,    $f6,    $f2             \n\t"                   \
 +        "paddh      $f6,    $f6,    $f14            \n\t"                   \
 +        "li         $8,     0x06                    \n\t"                   \
 +        "mtc1       $8,     $f16                    \n\t"                   \
 +        "psrah      $f6,    $f6,    $f16            \n\t"                   \
 +        "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
 +        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
 +        PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
 +        PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
 +        "bnez       %[h],   1b                      \n\t"                   \
 +        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
 +          [h]"+r"(h),                                                       \
 +          [src]"+r"(src),               [dst]"+r"(dst)                      \
 +        : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
 +          [stride_x3]"r"(3*src_stride),                                     \
 +          [rnd]"m"(rnd),                [shift]"f"(shift),                  \
 +          [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
 +          [ff_pw_3]"f"(ff_pw_3)                                             \
 +        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
 +          "$f14", "$f16", "memory"                                          \
 +    );                                                                      \
 +}
 +
 +/**
 + * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 + * Here the data is 16 bits wide, so the parameters passed as A1 to A4 are
 + * simple constant byte offsets.
 + *
 + * @param  NAME   Either 1 or 3
 + * @see MSPEL_FILTER13_CORE for information on A1->A4
 + */
 +#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
 +static void                                                                 \
 +OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
 +                                       const int16_t *src, int rnd)         \
 +{                                                                           \
 +    int h = 8;                                                              \
 +    DECLARE_VAR_ALL64;                                                      \
 +    DECLARE_VAR_ADDRT;                                                      \
 +                                                                            \
 +    src -= 1;                                                               \
 +    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
 +                                                                            \
 +    __asm__ volatile(                                                       \
 +        "xor        $f0,    $f0,    $f0             \n\t"                   \
 +        LOAD_ROUNDER_MMI("%[rnd]")                                          \
 +        "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
 +        "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
 +        ".p2align 3                                 \n\t"                   \
 +        "1:                                         \n\t"                   \
 +        MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
 +        "li         $8,     0x07                    \n\t"                   \
 +        "mtc1       $8,     $f16                    \n\t"                   \
 +        NORMALIZE_MMI("$f16")                                               \
 +        /* Remove bias */                                                   \
 +        "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
 +        "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
 +        TRANSFER_DO_PACK(OP)                                                \
 +        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
 +        PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
 +        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
 +        "bnez       %[h],   1b                      \n\t"                   \
 +        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
 +          [h]"+r"(h),                                                       \
 +          [src]"+r"(src),               [dst]"+r"(dst)                      \
 +        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
 +          [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
 +          [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
 +        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
 +          "$f14", "$f16", "memory"                                          \
 +    );                                                                      \
 +}
 +
 +/**
 + * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
 + * Here, offset=src_stride. Parameters passed as A1 to A4 must use
 + * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
 + *
 + * @param  NAME   Either 1 or 3
 + * @see MSPEL_FILTER13_CORE for information on A1->A4
 + */
 +#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
 +static void                                                                 \
 +OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
 +                              mips_reg stride, int rnd, mips_reg offset)    \
 +{                                                                           \
 +    int h = 8;                                                              \
 +    DECLARE_VAR_LOW32;                                                      \
 +    DECLARE_VAR_ADDRT;                                                      \
 +                                                                            \
 +    src -= offset;                                                          \
 +    rnd = 32-rnd;                                                           \
 +                                                                            \
 +    __asm__ volatile (                                                      \
 +        "xor        $f0,    $f0,    $f0             \n\t"                   \
 +        LOAD_ROUNDER_MMI("%[rnd]")                                          \
 +        "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
 +        "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
 +        ".p2align 3                                 \n\t"                   \
 +        "1:                                         \n\t"                   \
 +        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
 +        "li         $8,     0x06                    \n\t"                   \
 +        "mtc1       $8,     $f16                    \n\t"                   \
 +        NORMALIZE_MMI("$f16")                                               \
 +        TRANSFER_DO_PACK(OP)                                                \
 +        "addiu      %[h],   %[h],      -0x01        \n\t"                   \
 +        PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
 +        PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
 +        "bnez       %[h],   1b                      \n\t"                   \
 +        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
 +          [h]"+r"(h),                                                       \
 +          [src]"+r"(src),               [dst]"+r"(dst)                      \
 +        : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
 +          [offset_x3]"r"(3*offset),     [stride]"g"(stride),                \
 +          [rnd]"m"(rnd),                                                    \
 +          [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
 +          [ff_pw_3]"f"(ff_pw_3)                                             \
 +        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
 +          "$f14", "$f16", "memory"                                          \
 +    );                                                                      \
 +}
 +
 +
 +/** 1/4 shift bicubic interpolation */
 +MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
 +MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
 +MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
 +MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
 +MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
 +
 +/** 3/4 shift bicubic interpolation */
 +MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
 +MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
 +MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
 +MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
 +MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
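
The instantiations above order the taps as (-4, 53, 18, -3) for the 1/4
shift and mirror them for the 3/4 shift. A scalar sketch of the 8-bit
1/4-shift put case (hypothetical helper; same 32 - rnd rounder and >> 6
normalization as the asm):

    static void put_vc1_shift1_c(uint8_t *dst, const uint8_t *src,
                                 int stride, int rnd, int offset)
    {
        int x, y;

        for (y = 0; y < 8; y++) {
            for (x = 0; x < 8; x++)
                dst[x] = av_clip_uint8((-4 * src[x -     offset] +
                                        53 * src[x             ] +
                                        18 * src[x +     offset] -
                                         3 * src[x + 2 * offset] +
                                        32 - rnd) >> 6);
            src += stride;
            dst += stride;
        }
    }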
 +
 +typedef void (*vc1_mspel_mc_filter_ver_16bits)
 +             (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
 +              int64_t shift);
 +typedef void (*vc1_mspel_mc_filter_hor_16bits)
 +             (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
 +typedef void (*vc1_mspel_mc_filter_8bits)
 +             (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
 +              mips_reg offset);
 +
 +/**
 + * Interpolate fractional pel values by applying the proper vertical then
 + * horizontal filter.
 + *
 + * @param  dst     Destination buffer for interpolated pels.
 + * @param  src     Source buffer.
 + * @param  stride  Stride for both src and dst buffers.
 + * @param  hmode   Horizontal filter (expressed in quarter-pel shift).
 + * @param  vmode   Vertical filter (expressed in quarter-pel shift).
 + * @param  rnd     Rounding bias.
 + */
 +#define VC1_MSPEL_MC(OP)                                                    \
 +static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
 +                               int hmode, int vmode, int rnd)               \
 +{                                                                           \
 +    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
 +         { NULL, vc1_put_ver_16b_shift1_mmi,                                \
 +                 vc1_put_ver_16b_shift2_mmi,                                \
 +                 vc1_put_ver_16b_shift3_mmi };                              \
 +    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
 +         { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
 +                 OP ## vc1_hor_16b_shift2_mmi,                              \
 +                 OP ## vc1_hor_16b_shift3_mmi };                            \
 +    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
 +         { NULL, OP ## vc1_shift1_mmi,                                      \
 +                 OP ## vc1_shift2_mmi,                                      \
 +                 OP ## vc1_shift3_mmi };                                    \
 +                                                                            \
 +    if (vmode) { /* Vertical filter to apply */                             \
 +        if (hmode) { /* Horizontal filter to apply, output to tmp */        \
 +            static const int shift_value[] = { 0, 5, 1, 5 };                \
 +            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
 +            int    r;                                                       \
 +            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
 +                                                                            \
 +            r = (1<<(shift-1)) + rnd-1;                                     \
 +            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
 +                                                                            \
 +            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
 +            return;                                                         \
 +        }                                                                   \
 +        else { /* No horizontal filter, output 8 lines to dst */            \
 +            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
 +            return;                                                         \
 +        }                                                                   \
 +    }                                                                       \
 +                                                                            \
 +    /* Horizontal mode with no vertical mode */                             \
 +    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
 +}                                                                           \
 +static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
 +                                  int stride, int hmode, int vmode, int rnd)\
 +{                                                                           \
 +    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
 +    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
 +    dst += 8*stride; src += 8*stride;                                       \
 +    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
 +    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
 +}
 +
 +VC1_MSPEL_MC(put_)
 +VC1_MSPEL_MC(avg_)
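
For example, the mc12 case declared further below (hmode = 1, vmode = 2,
i.e. 1/4-pel horizontal and 1/2-pel vertical) selects
shift = (shift_value[1] + shift_value[2]) >> 1 = (5 + 1) >> 1 = 3, so the
vertical 16-bit pass uses the rounder r = (1 << 2) + rnd - 1 = rnd + 3 and
the horizontal pass then rounds with 64 - rnd.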
 +
 +/** Macro to ease declaring the bicubic filter interpolation functions */
 +#define DECLARE_FUNCTION(a, b)                                              \
 +void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
 +                                           const uint8_t *src,              \
 +                                           ptrdiff_t stride,                \
 +                                           int rnd)                         \
 +{                                                                           \
 +     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
 +}                                                                           \
 +void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
 +                                           const uint8_t *src,              \
 +                                           ptrdiff_t stride,                \
 +                                           int rnd)                         \
 +{                                                                           \
 +     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
 +}                                                                           \
 +void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
 +                                              const uint8_t *src,           \
 +                                              ptrdiff_t stride,             \
 +                                              int rnd)                      \
 +{                                                                           \
 +     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
 +}                                                                           \
 +void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
 +                                              const uint8_t *src,           \
 +                                              ptrdiff_t stride,             \
 +                                              int rnd)                      \
 +{                                                                           \
 +     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
 +}
 +
 +DECLARE_FUNCTION(0, 1)
 +DECLARE_FUNCTION(0, 2)
 +DECLARE_FUNCTION(0, 3)
 +
 +DECLARE_FUNCTION(1, 0)
 +DECLARE_FUNCTION(1, 1)
 +DECLARE_FUNCTION(1, 2)
 +DECLARE_FUNCTION(1, 3)
 +
 +DECLARE_FUNCTION(2, 0)
 +DECLARE_FUNCTION(2, 1)
 +DECLARE_FUNCTION(2, 2)
 +DECLARE_FUNCTION(2, 3)
 +
 +DECLARE_FUNCTION(3, 0)
 +DECLARE_FUNCTION(3, 1)
 +DECLARE_FUNCTION(3, 2)
 +DECLARE_FUNCTION(3, 3)
 +
 +#define CHROMA_MC_8_MMI                                                     \
 +        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
 +        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
 +        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
 +        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
 +                                                                            \
 +        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
 +        "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
 +        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
 +        "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
 +        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
 +        "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
 +        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
 +        "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
 +                                                                            \
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
 +        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
 +                                                                            \
 +        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
 +        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
 +        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
 +        "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
 +                                                                            \
 +        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
 +        "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
 +        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
 +
 +
 +#define CHROMA_MC_4_MMI                                                     \
 +        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
 +        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
 +                                                                            \
 +        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
 +        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
 +        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
 +        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
 +                                                                            \
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
 +        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
 +        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
 +                                                                            \
 +        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
 +        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
 +
 +
 +void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y)
 +{
 +    const int A = (8 - x) * (8 - y);
 +    const int B =     (x) * (8 - y);
 +    const int C = (8 - x) *     (y);
 +    const int D =     (x) *     (y);
 +    double ftmp[10];
 +    uint32_t tmp[1];
 +    DECLARE_VAR_ALL64;
 +    DECLARE_VAR_ADDRT;
 +
 +    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 +
 +    __asm__ volatile(
 +        "li         %[tmp0],    0x06                                    \n\t"
 +        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
 +        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
 +        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
 +        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
 +        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_ULDC1(%[ftmp1], %[src], 0x00)
 +        MMI_ULDC1(%[ftmp2], %[src], 0x01)
 +        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
 +        MMI_ULDC1(%[ftmp3], %[src], 0x00)
 +        MMI_ULDC1(%[ftmp4], %[src], 0x01)
 +
 +        CHROMA_MC_8_MMI
 +
 +        MMI_SDC1(%[ftmp1], %[dst], 0x00)
 +        "addiu      %[h],       %[h],      -0x01                        \n\t"
 +        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
 +        "bnez       %[h],       1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          RESTRICT_ASM_ALL64
 +          RESTRICT_ASM_ADDRT
 +          [tmp0]"=&r"(tmp[0]),
 +          [src]"+&r"(src),              [dst]"+&r"(dst),
 +          [h]"+&r"(h)
 +        : [stride]"r"((mips_reg)stride),
 +          [A]"f"(A),                    [B]"f"(B),
 +          [C]"f"(C),                    [D]"f"(D),
 +          [ff_pw_28]"f"(ff_pw_28)
 +        : "memory"
 +    );
 +}
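
A scalar sketch of the bilinear interpolation performed by the MMI loop
above (the helper name is made up for illustration); the + 28 rounder and
the >> 6 match the ff_pw_28 constant and the psrlh by 6 in the asm:

    static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst, const uint8_t *src,
                                            int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =     (x) * (8 - y);
        const int C = (8 - x) *     (y);
        const int D =     (x) *     (y);
        int i;

        while (h--) {
            for (i = 0; i < 8; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          28) >> 6;
            src += stride;
            dst += stride;
        }
    }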
 +
 +void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y)
 +{
 +    const int A = (8 - x) * (8 - y);
 +    const int B =     (x) * (8 - y);
 +    const int C = (8 - x) *     (y);
 +    const int D =     (x) *     (y);
 +    double ftmp[6];
 +    uint32_t tmp[1];
 +    DECLARE_VAR_LOW32;
 +    DECLARE_VAR_ADDRT;
 +
 +    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 +
 +    __asm__ volatile(
 +        "li         %[tmp0],    0x06                                    \n\t"
 +        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
 +        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
 +        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
 +        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
 +        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_ULWC1(%[ftmp1], %[src], 0x00)
 +        MMI_ULWC1(%[ftmp2], %[src], 0x01)
 +        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
 +        MMI_ULWC1(%[ftmp3], %[src], 0x00)
 +        MMI_ULWC1(%[ftmp4], %[src], 0x01)
 +
 +        CHROMA_MC_4_MMI
 +
 +        MMI_SWC1(%[ftmp1], %[dst], 0x00)
 +        "addiu      %[h],       %[h],      -0x01                        \n\t"
 +        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
 +        "bnez       %[h],       1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [tmp0]"=&r"(tmp[0]),
 +          RESTRICT_ASM_LOW32
 +          RESTRICT_ASM_ADDRT
 +          [src]"+&r"(src),              [dst]"+&r"(dst),
 +          [h]"+&r"(h)
 +        : [stride]"r"((mips_reg)stride),
 +          [A]"f"(A),                    [B]"f"(B),
 +          [C]"f"(C),                    [D]"f"(D),
 +          [ff_pw_28]"f"(ff_pw_28)
 +        : "memory"
 +    );
 +}
 +
 +void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y)
 +{
 +    const int A = (8 - x) * (8 - y);
 +    const int B =     (x) * (8 - y);
 +    const int C = (8 - x) *     (y);
 +    const int D =     (x) *     (y);
 +    double ftmp[10];
 +    uint32_t tmp[1];
 +    DECLARE_VAR_ALL64;
 +    DECLARE_VAR_ADDRT;
 +
 +    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 +
 +    __asm__ volatile(
 +        "li         %[tmp0],    0x06                                    \n\t"
 +        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
 +        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
 +        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
 +        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
 +        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_ULDC1(%[ftmp1], %[src], 0x00)
 +        MMI_ULDC1(%[ftmp2], %[src], 0x01)
 +        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
 +        MMI_ULDC1(%[ftmp3], %[src], 0x00)
 +        MMI_ULDC1(%[ftmp4], %[src], 0x01)
 +
 +        CHROMA_MC_8_MMI
 +
 +        MMI_LDC1(%[ftmp2], %[dst], 0x00)
 +        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
 +
 +        MMI_SDC1(%[ftmp1], %[dst], 0x00)
 +        "addiu      %[h],       %[h],      -0x01                        \n\t"
 +        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
 +        "bnez       %[h],       1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 +          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 +          [tmp0]"=&r"(tmp[0]),
 +          RESTRICT_ASM_ALL64
 +          RESTRICT_ASM_ADDRT
 +          [src]"+&r"(src),              [dst]"+&r"(dst),
 +          [h]"+&r"(h)
 +        : [stride]"r"((mips_reg)stride),
 +          [A]"f"(A),                    [B]"f"(B),
 +          [C]"f"(C),                    [D]"f"(D),
 +          [ff_pw_28]"f"(ff_pw_28)
 +        : "memory"
 +    );
 +}
 +
 +void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
 +                                      uint8_t *src /* align 1 */,
 +                                      int stride, int h, int x, int y)
 +{
 +    const int A = (8 - x) * (8 - y);
 +    const int B = (    x) * (8 - y);
 +    const int C = (8 - x) * (    y);
 +    const int D = (    x) * (    y);
 +    double ftmp[6];
 +    uint32_t tmp[1];
 +    DECLARE_VAR_LOW32;
 +    DECLARE_VAR_ADDRT;
 +
 +    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 +
 +    __asm__ volatile(
 +        "li         %[tmp0],    0x06                                    \n\t"
 +        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 +        "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
 +        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
 +        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
 +        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
 +        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
 +
 +        "1:                                                             \n\t"
 +        MMI_ULWC1(%[ftmp1], %[src], 0x00)
 +        MMI_ULWC1(%[ftmp2], %[src], 0x01)
 +        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
 +        MMI_ULWC1(%[ftmp3], %[src], 0x00)
 +        MMI_ULWC1(%[ftmp4], %[src], 0x01)
 +
 +        CHROMA_MC_4_MMI
 +
 +        MMI_LWC1(%[ftmp2], %[dst], 0x00)
 +        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
 +
 +        MMI_SWC1(%[ftmp1], %[dst], 0x00)
 +        "addiu      %[h],       %[h],      -0x01                        \n\t"
 +        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
 +        "bnez       %[h],       1b                                      \n\t"
 +        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 +          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 +          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 +          [tmp0]"=&r"(tmp[0]),
 +          RESTRICT_ASM_LOW32
 +          RESTRICT_ASM_ADDRT
 +          [src]"+&r"(src),              [dst]"+&r"(dst),
 +          [h]"+&r"(h)
 +        : [stride]"r"((mips_reg)stride),
 +          [A]"f"(A),                    [B]"f"(B),
 +          [C]"f"(C),                    [D]"f"(D),
 +          [ff_pw_28]"f"(ff_pw_28)
 +        : "memory"
 +    );
 +}
diff --cc libavcodec/ppc/idctdsp.c
index 80e71fd,0aaaac0..f1b4247
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@@ -153,23 -153,7 +153,23 @@@ static const vec_s16 constants[5] = 
      { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
  };
  
 +static void idct_altivec(int16_t *blk)
 +{
 +    vec_s16 *block = (vec_s16 *) blk;
 +
 +    IDCT;
 +
 +    block[0] = vx0;
 +    block[1] = vx1;
 +    block[2] = vx2;
 +    block[3] = vx3;
 +    block[4] = vx4;
 +    block[5] = vx5;
 +    block[6] = vx6;
 +    block[7] = vx7;
 +}
 +
- static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
+ static void idct_put_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
  {
      vec_s16 *block = (vec_s16 *) blk;
      vec_u8 tmp;
diff --cc libavcodec/simple_idct.h
index 154e297,edc994d..2a5e1d7
--- a/libavcodec/simple_idct.h
+++ b/libavcodec/simple_idct.h
@@@ -28,20 -28,16 +28,21 @@@
  #ifndef AVCODEC_SIMPLE_IDCT_H
  #define AVCODEC_SIMPLE_IDCT_H
  
+ #include <stddef.h>
  #include <stdint.h>
  
- void ff_simple_idct_put_8(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct_add_8(uint8_t *dest, int line_size, int16_t *block);
+ void ff_simple_idct_put_8(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+ void ff_simple_idct_add_8(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
  void ff_simple_idct_8(int16_t *block);
  
- void ff_simple_idct_put_10(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct_add_10(uint8_t *dest, int line_size, int16_t *block);
+ void ff_simple_idct_put_10(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+ void ff_simple_idct_add_10(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
  void ff_simple_idct_10(int16_t *block);
 +
- void ff_simple_idct_put_12(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct_add_12(uint8_t *dest, int line_size, int16_t *block);
++void ff_simple_idct_put_12(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
++void ff_simple_idct_add_12(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 +void ff_simple_idct_12(int16_t *block);
 +
  /**
   * Special version of ff_simple_idct_10() which does dequantization
   * and scales by a factor of 2 more between the two IDCTs to account
diff --cc libavcodec/simple_idct_template.c
index c669767,d10df31..f532313
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@@ -247,34 -222,31 +247,34 @@@ static inline void FUNC(idctRowCondDC)(
          }                                               \
      } while (0)
  
 +#ifdef EXTRA_SHIFT
 +static inline void FUNC(idctSparseCol_extrashift)(int16_t *col)
 +#else
- static inline void FUNC(idctSparseColPut)(pixel *dest, int line_size,
+ static inline void FUNC(idctSparseColPut)(pixel *dest, ptrdiff_t line_size,
                                            int16_t *col)
  {
 -    int a0, a1, a2, a3, b0, b1, b2, b3;
 +    SUINT a0, a1, a2, a3, b0, b1, b2, b3;
  
      IDCT_COLS;
  
 -    dest[0] = av_clip_pixel((a0 + b0) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a1 + b1) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a2 + b2) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a3 + b3) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a3 - b3) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a2 - b2) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a1 - b1) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT);
      dest += line_size;
 -    dest[0] = av_clip_pixel((a0 - b0) >> COL_SHIFT);
 +    dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT);
  }
  
- static inline void FUNC(idctSparseColAdd)(pixel *dest, int line_size,
+ static inline void FUNC(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
                                            int16_t *col)
  {
      int a0, a1, a2, a3, b0, b1, b2, b3;
@@@ -315,8 -286,7 +315,8 @@@ static inline void FUNC(idctSparseCol)(
      col[56] = ((a0 - b0) >> COL_SHIFT);
  }
  
 +#ifndef EXTRA_SHIFT
- void FUNC(ff_simple_idct_put)(uint8_t *dest_, int line_size, int16_t *block)
+ void FUNC(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
  {
      pixel *dest = (pixel *)dest_;
      int i;
diff --cc libavcodec/x86/idctdsp.h
index daa4e79,6e6c688..e2e296a
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@@ -19,20 -19,14 +19,22 @@@
  #ifndef AVCODEC_X86_IDCTDSP_H
  #define AVCODEC_X86_IDCTDSP_H
  
+ #include <stddef.h>
  #include <stdint.h>
 +#include <stddef.h>
  
  void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                 ptrdiff_t line_size);
 +void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
 +                                ptrdiff_t line_size);
  void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                 ptrdiff_t line_size);
 +void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
 +                                ptrdiff_t line_size);
  void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                        ptrdiff_t line_size);
 +void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
 +                                       ptrdiff_t line_size);
 +
+ 
  #endif /* AVCODEC_X86_IDCTDSP_H */
diff --cc libavcodec/x86/simple_idct.h
index 8eeb31e,15784a9..ad76baf
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@@ -22,19 -23,7 +23,19 @@@
  #include <stdint.h>
  
  void ff_simple_idct_mmx(int16_t *block);
- void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
+ void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+ void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
  
 +void ff_simple_idct10_sse2(int16_t *block);
 +void ff_simple_idct10_avx(int16_t *block);
 +
- void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block);
++void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
++void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 +
 +void ff_simple_idct12_sse2(int16_t *block);
 +void ff_simple_idct12_avx(int16_t *block);
 +
- void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block);
- void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block);
++void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
++void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 +
  #endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --cc libavcodec/x86/vc1dsp_init.c
index c8943fa,aff4b26..e05ae06
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@@ -92,14 -80,6 +92,14 @@@ void ff_put_vc1_chroma_mc8_nornd_ssse3(
                                         int stride, int h, int x, int y);
  void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                         int stride, int h, int x, int y);
- void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
++void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
 +                                    int16_t *block);
- void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
++void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
 +                                    int16_t *block);
- void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
++void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
 +                                    int16_t *block);
- void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
++void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
 +                                    int16_t *block);
  
  
  av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
diff --cc libavcodec/x86/vc1dsp_mc.asm
index 175c397,0000000..2850ca8
mode 100644,000000..100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@@ -1,292 -1,0 +1,292 @@@
 +;******************************************************************************
 +;* VC1 motion compensation optimizations
 +;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
 +cextern pw_9
 +cextern pw_128
 +
 +section .text
 +
 +%if HAVE_MMX_INLINE
 +
 +; XXX some of these macros are not used right now, but they will in the future
 +;     when more functions are ported.
 +
 +%macro OP_PUT 2 ; dst, src
 +%endmacro
 +
 +%macro OP_AVG 2 ; dst, src
 +    pavgb           %1, %2
 +%endmacro
 +
 +%macro NORMALIZE_MMX 1 ; shift
 +    paddw           m3, m7 ; +bias-r
 +    paddw           m4, m7 ; +bias-r
 +    psraw           m3, %1
 +    psraw           m4, %1
 +%endmacro
 +
 +%macro TRANSFER_DO_PACK 2 ; op, dst
 +    packuswb        m3, m4
 +    %1              m3, [%2]
 +    mova          [%2], m3
 +%endmacro
 +
 +%macro TRANSFER_DONT_PACK 2 ; op, dst
 +    %1              m3, [%2]
 +    %1              m3, [%2 + mmsize]
 +    mova          [%2], m3
 +    mova [mmsize + %2], m4
 +%endmacro
 +
 +; see MSPEL_FILTER13_CORE for use as UNPACK macro
 +%macro DO_UNPACK 1 ; reg
 +    punpcklbw       %1, m0
 +%endmacro
 +%macro DONT_UNPACK 1 ; reg
 +%endmacro
 +
 +; Compute the rounder 32-r or 8-r and unpacks it to m7
 +%macro LOAD_ROUNDER_MMX 1 ; round
 +    movd      m7, %1
 +    punpcklwd m7, m7
 +    punpckldq m7, m7
 +%endmacro
 +
 +%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
 +    paddw          m%3, m%4
 +    movh           m%2, [srcq + stride_neg2]
 +    pmullw         m%3, m6
 +    punpcklbw      m%2, m0
 +    movh           m%5, [srcq + strideq]
 +    psubw          m%3, m%2
 +    punpcklbw      m%5, m0
 +    paddw          m%3, m7
 +    psubw          m%3, m%5
 +    psraw          m%3, shift
 +    movu   [dstq + %1], m%3
 +    add           srcq, strideq
 +%endmacro
 +
 +INIT_MMX mmx
 +; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
 +;                                    x86_reg stride, int rnd, int64_t shift)
 +; Sacrificing m6 makes it possible to pipeline loads from src
 +%if ARCH_X86_32
 +cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
 +    DECLARE_REG_TMP     3, 4, 5
 +    %define rnd r3mp
 +    %define shift qword r4m
 +%else ; X86_64
 +cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
 +    DECLARE_REG_TMP     4, 5, 6
 +    %define   rnd r3d
 +    ; We need shift either in memory or in a mm reg as it's used in psraw
 +    ; On WIN64, the arg is already on the stack
 +    ; On UNIX64, m5 doesn't seem to be used
 +%if WIN64
 +    %define shift r4mp
 +%else ; UNIX64
 +    %define shift m5
 +    mova shift, r4q
 +%endif ; WIN64
 +%endif ; X86_32
 +%define stride_neg2 t0q
 +%define stride_9minus4 t1q
 +%define i t2q
 +    mov       stride_neg2, strideq
 +    neg       stride_neg2
 +    add       stride_neg2, stride_neg2
 +    lea    stride_9minus4, [strideq * 9 - 4]
 +    mov                 i, 3
 +    LOAD_ROUNDER_MMX  rnd
 +    mova               m6, [pw_9]
 +    pxor               m0, m0
 +.loop:
 +    movh               m2, [srcq]
 +    add              srcq, strideq
 +    movh               m3, [srcq]
 +    punpcklbw          m2, m0
 +    punpcklbw          m3, m0
 +    SHIFT2_LINE         0, 1, 2, 3, 4
 +    SHIFT2_LINE        24, 2, 3, 4, 1
 +    SHIFT2_LINE        48, 3, 4, 1, 2
 +    SHIFT2_LINE        72, 4, 1, 2, 3
 +    SHIFT2_LINE        96, 1, 2, 3, 4
 +    SHIFT2_LINE       120, 2, 3, 4, 1
 +    SHIFT2_LINE       144, 3, 4, 1, 2
 +    SHIFT2_LINE       168, 4, 1, 2, 3
 +    sub              srcq, stride_9minus4
 +    add              dstq, 8
 +    dec                 i
 +        jnz         .loop
 +    REP_RET
 +%undef rnd
 +%undef shift
 +%undef stride_neg2
 +%undef stride_9minus4
 +%undef i
 +
 +; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
 +;                                  const int16_t *src, int rnd);
 +; Data is already unpacked, so some operations can directly be made from
 +; memory.
 +%macro HOR_16B_SHIFT2 2 ; op, opname
 +cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
 +    mov                hq, 8
 +    sub              srcq, 2
 +    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
 +    LOAD_ROUNDER_MMX rndd
 +    mova               m5, [pw_9]
 +    mova               m6, [pw_128]
 +    pxor               m0, m0
 +
 +.loop:
 +    mova               m1, [srcq + 2 * 0]
 +    mova               m2, [srcq + 2 * 0 + mmsize]
 +    mova               m3, [srcq + 2 * 1]
 +    mova               m4, [srcq + 2 * 1 + mmsize]
 +    paddw              m3, [srcq + 2 * 2]
 +    paddw              m4, [srcq + 2 * 2 + mmsize]
 +    paddw              m1, [srcq + 2 * 3]
 +    paddw              m2, [srcq + 2 * 3 + mmsize]
 +    pmullw             m3, m5
 +    pmullw             m4, m5
 +    psubw              m3, m1
 +    psubw              m4, m2
 +    NORMALIZE_MMX      7
 +    ; remove bias
 +    paddw              m3, m6
 +    paddw              m4, m6
 +    TRANSFER_DO_PACK   %1, dstq
 +    add              srcq, 24
 +    add              dstq, strideq
 +    dec                hq
 +        jnz         .loop
 +
 +    RET
 +%endmacro
 +
 +INIT_MMX mmx
 +HOR_16B_SHIFT2 OP_PUT, put
 +
 +INIT_MMX mmxext
 +HOR_16B_SHIFT2 OP_AVG, avg
 +%endif ; HAVE_MMX_INLINE
 +
 +%macro INV_TRANS_INIT 0
 +    movsxdifnidn linesizeq, linesized
 +    movd       m0, blockd
 +    SPLATW     m0, m0
 +    pxor       m1, m1
 +    psubw      m1, m0
 +    packuswb   m0, m0
 +    packuswb   m1, m1
 +
 +    DEFINE_ARGS dest, linesize, linesize3
 +    lea    linesize3q, [linesizeq*3]
 +%endmacro
 +
 +%macro INV_TRANS_PROCESS 1
 +    mov%1                  m2, [destq+linesizeq*0]
 +    mov%1                  m3, [destq+linesizeq*1]
 +    mov%1                  m4, [destq+linesizeq*2]
 +    mov%1                  m5, [destq+linesize3q]
 +    paddusb                m2, m0
 +    paddusb                m3, m0
 +    paddusb                m4, m0
 +    paddusb                m5, m0
 +    psubusb                m2, m1
 +    psubusb                m3, m1
 +    psubusb                m4, m1
 +    psubusb                m5, m1
 +    mov%1 [linesizeq*0+destq], m2
 +    mov%1 [linesizeq*1+destq], m3
 +    mov%1 [linesizeq*2+destq], m4
 +    mov%1 [linesize3q +destq], m5
 +%endmacro
 +
- ; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
++; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
 +    movsx         r3d, WORD [blockq]
 +    mov        blockd, r3d             ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
 +    sar        blockd, 3               ; >> 3
 +    mov           r3d, blockd          ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
 +    sar        blockd, 7               ; >> 7
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS h
 +    RET
 +
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
 +    movsx         r3d, WORD [blockq]
 +    mov        blockd, r3d             ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
 +    sar        blockd, 3               ; >> 3
 +    shl        blockd, 2               ;  4 * dc
 +    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
 +    sar        blockd, 7               ; >> 7
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS h
 +    lea         destq, [destq+linesizeq*4]
 +    INV_TRANS_PROCESS h
 +    RET
 +
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
 +    movsx      blockd, WORD [blockq]   ; dc
 +    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
 +    sar        blockd, 1               ; >> 1
 +    mov           r3d, blockd          ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
 +    sar        blockd, 7               ; >> 7
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS a
 +    RET
 +
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
 +    movsx      blockd, WORD [blockq]   ; dc
 +    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
 +    sar        blockd, 1               ; >> 1
 +    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
 +    sar        blockd, 5               ; >> 5
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS a
 +    lea         destq, [destq+linesizeq*4]
 +    INV_TRANS_PROCESS a
 +    RET
diff --cc libavcodec/x86/xvididct.h
index 573b25c,6640b6b..edb5ebf
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@@ -29,15 -30,15 +30,15 @@@
  #include <stdint.h>
  
  void ff_xvid_idct_mmx(short *block);
- void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block);
- void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block);
+ void ff_xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+ void ff_xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
  
  void ff_xvid_idct_mmxext(short *block);
- void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
- void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
+ void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+ void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
  
  void ff_xvid_idct_sse2(short *block);
- void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
- void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
 -void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block);
 -void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block);
++void ff_xvid_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
++void ff_xvid_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
  
  #endif /* AVCODEC_X86_XVIDIDCT_H */
diff --cc libavcodec/x86/xvididct_init.c
index 8b9d8de,e4f7345..fd10953
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@@ -26,32 -26,6 +26,32 @@@
  #include "idctdsp.h"
  #include "xvididct.h"
  
 +#if ARCH_X86_32 && HAVE_YASM
- static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
++static void xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, short *block)
 +{
 +    ff_xvid_idct_mmx(block);
 +    ff_put_pixels_clamped(block, dest, line_size);
 +}
 +
- static void xvid_idct_mmx_add(uint8_t *dest, int line_size, short *block)
++static void xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, short *block)
 +{
 +    ff_xvid_idct_mmx(block);
 +    ff_add_pixels_clamped(block, dest, line_size);
 +}
 +
- static void xvid_idct_mmxext_put(uint8_t *dest, int line_size, short *block)
++static void xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, short *block)
 +{
 +    ff_xvid_idct_mmxext(block);
 +    ff_put_pixels_clamped(block, dest, line_size);
 +}
 +
- static void xvid_idct_mmxext_add(uint8_t *dest, int line_size, short *block)
++static void xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, short *block)
 +{
 +    ff_xvid_idct_mmxext(block);
 +    ff_add_pixels_clamped(block, dest, line_size);
 +}
 +#endif
 +
  av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                     unsigned high_bit_depth)
  {