[FFmpeg-cvslog] Merge commit 'e4a94d8b36c48d95a7d412c40d7b558422ff659c'
James Almer
git at videolan.org
Tue Mar 21 20:27:07 EET 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Mar 21 15:20:45 2017 -0300| [a8474df9447d6466c77d3ec8f414cda2662f057b] | committer: James Almer
Merge commit 'e4a94d8b36c48d95a7d412c40d7b558422ff659c'
* commit 'e4a94d8b36c48d95a7d412c40d7b558422ff659c':
h264chroma: Change type of stride parameters to ptrdiff_t
Merged-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a8474df9447d6466c77d3ec8f414cda2662f057b
---
libavcodec/aarch64/h264chroma_init_aarch64.c | 12 ++++++------
libavcodec/aarch64/h264cmc_neon.S | 7 ++-----
libavcodec/aarch64/rv40dsp_init_aarch64.c | 16 ++++++++--------
libavcodec/aarch64/vc1dsp_init_aarch64.c | 16 ++++++++--------
libavcodec/arm/h264chroma_init_arm.c | 18 ++++++++++++------
libavcodec/arm/h264cmc_neon.S | 4 ++--
libavcodec/arm/vc1dsp_init_neon.c | 16 ++++++++--------
libavcodec/h264chroma.h | 3 ++-
libavcodec/h264chroma_template.c | 21 +++++++++++++--------
libavcodec/mips/h264chroma_mips.h | 20 ++++++++++----------
libavcodec/mips/h264chroma_mmi.c | 8 ++++----
libavcodec/mips/h264chroma_msa.c | 12 ++++++------
libavcodec/ppc/h264chroma_template.c | 9 +++++++--
libavcodec/rv40dsp.c | 14 ++++++++++----
libavcodec/vc1dsp.c | 8 ++++----
libavcodec/x86/h264_chromamc.asm | 18 +-----------------
libavcodec/x86/h264_chromamc_10bit.asm | 15 ++++++---------
libavcodec/x86/h264chroma_init.c | 26 +++++++++++++-------------
libavcodec/x86/rv40dsp_init.c | 12 ++++++------
libavcodec/x86/vc1dsp_init.c | 10 +++++-----
20 files changed, 133 insertions(+), 132 deletions(-)
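
The change replaces the int stride arguments with ptrdiff_t across the h264chroma callbacks (and the rv40/vc1 code that reuses them), so strides are already pointer-width and the 64-bit assembly no longer needs the explicit sign-extensions (the sxtw / movsxd / movsxdifnidn instructions removed in the hunks below). A minimal standalone sketch of the idea, not FFmpeg code (copy_block2 is a made-up stand-in for the real mc functions):

/* Minimal sketch, not FFmpeg code: why ptrdiff_t for strides.
 * On LP64 targets int is 32-bit while pointers are 64-bit, so an int
 * stride has to be sign-extended before pointer arithmetic; a ptrdiff_t
 * stride is already pointer-width, which is why the assembly below can
 * drop its explicit sign-extensions. */
#include <stddef.h>
#include <stdint.h>

/* Standalone copy of the callback shape after this change. */
typedef void (*chroma_mc_fn)(uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h, int x, int y);

/* Hypothetical illustration only: copies a 2-pixel-wide column. */
static void copy_block2(uint8_t *dst, uint8_t *src,
                        ptrdiff_t stride, int h, int x, int y)
{
    (void)x; (void)y;
    for (int i = 0; i < h; i++) {
        dst[0] = src[0];
        dst[1] = src[1];
        dst += stride;   /* pointer-width step, no narrowing or re-extension */
        src += stride;
    }
}

int main(void)
{
    uint8_t src[4 * 16] = { 1, 2 }, dst[4 * 16] = { 0 };
    chroma_mc_fn mc = copy_block2;
    mc(dst, src, 16, 4, 0, 0);   /* stride passed as ptrdiff_t */
    return dst[0] == 1 ? 0 : 1;
}
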
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index 2af62be..fa6e0ea 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -28,18 +28,18 @@
#include "config.h"
-void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index ff97a29..8be7578 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -21,10 +21,9 @@
#include "libavutil/aarch64/asm.S"
-/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
- sxtw x2, w2
.ifc \type,avg
mov x8, x0
.endif
@@ -192,10 +191,9 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
endfunc
.endm
-/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
- sxtw x2, w2
.ifc \type,avg
mov x8, x0
.endif
@@ -359,7 +357,6 @@ endfunc
.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
- sxtw x2, w2
prfm pldl1strm, [x1]
prfm pldl1strm, [x1, x2]
orr w7, w4, w5
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index 764bc1e..142705d 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -25,15 +25,15 @@
#include "config.h"
-void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
av_cold void ff_rv40dsp_init_aarch64(RV34DSPContext *c)
{
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index e59e55e..13dfd74 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,14 +25,14 @@
#include "config.h"
-void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 13f7e0d..aae804b 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -26,13 +26,19 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264chroma.h"
-void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
-void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
{
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index fc48a6f..5a4159e 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -20,7 +20,7 @@
#include "libavutil/arm/asm.S"
-/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
push {r4-r7, lr}
@@ -195,7 +195,7 @@ T cmp r7, #0
endfunc
.endm
-/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
push {r4-r7, lr}
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index c340144..005d45c 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -70,14 +70,14 @@ DECL_PUT(3, 1)
DECL_PUT(3, 2)
DECL_PUT(3, 3)
-void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
-void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
#define FN_ASSIGN(X, Y) \
dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index e0f45ad..5c89fd1 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -19,9 +19,10 @@
#ifndef AVCODEC_H264CHROMA_H
#define AVCODEC_H264CHROMA_H
+#include <stddef.h>
#include <stdint.h>
-typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ptrdiff_t srcStride, int h, int x, int y);
typedef struct H264ChromaContext {
h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
diff --git a/libavcodec/h264chroma_template.c b/libavcodec/h264chroma_template.c
index 072b5e0..a3ca07b 100644
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@ -19,12 +19,14 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/avassert.h"
+#include <stddef.h>
+
+#include "libavutil/avassert.h"
#include "bit_depth_template.c"
#define H264_CHROMA_MC(OPNAME, OP)\
-static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y){\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -58,7 +60,8 @@ static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *
}\
}\
}\
-static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -79,7 +82,7 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *
}\
} else if (B + C) {\
const int E= B+C;\
- const int step= C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i=0; i<h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0]));\
OP(dst[1], (A*src[1] + E*src[step+1]));\
@@ -96,7 +99,8 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *
}\
}\
\
-static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -119,7 +123,7 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *
}\
} else if (B + C) {\
const int E= B+C;\
- const int step= C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i=0; i<h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0]));\
OP(dst[1], (A*src[1] + E*src[step+1]));\
@@ -140,7 +144,8 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *
}\
}\
\
-static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
@@ -167,7 +172,7 @@ static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst/*align 8*/, uint8_t *
}\
} else if (B + C) {\
const int E= B+C;\
- const int step= C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i=0; i<h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0]));\
OP(dst[1], (A*src[1] + E*src[step+1]));\
diff --git a/libavcodec/mips/h264chroma_mips.h b/libavcodec/mips/h264chroma_mips.h
index 6e6127d..996384d 100644
--- a/libavcodec/mips/h264chroma_mips.h
+++ b/libavcodec/mips/h264chroma_mips.h
@@ -22,26 +22,26 @@
#define AVCODEC_MIPS_H264CHROMA_MIPS_H
#include "libavcodec/h264dec.h"
-void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int x, int y);
-void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
-void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */
diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c
index 417b4a2..bafe0f9 100644
--- a/libavcodec/mips/h264chroma_mmi.c
+++ b/libavcodec/mips/h264chroma_mmi.c
@@ -26,7 +26,7 @@
#include "constants.h"
#include "libavutil/mips/mmiutils.h"
-void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
@@ -207,7 +207,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
}
}
-void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
@@ -396,7 +396,7 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
}
}
-void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
@@ -546,7 +546,7 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
}
}
-void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y)
{
const int A = (8 - x) *(8 - y);
diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
index 67d0bc1..940e12d 100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -1869,7 +1869,7 @@ static void avg_width8_msa(uint8_t *src, int32_t src_stride,
}
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
@@ -1886,7 +1886,7 @@ void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
}
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
int32_t cnt;
@@ -1910,7 +1910,7 @@ void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
}
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
int32_t cnt;
@@ -1934,7 +1934,7 @@ void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
}
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
@@ -1955,7 +1955,7 @@ void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
}
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
@@ -1975,7 +1975,7 @@ void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
}
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
+ ptrdiff_t stride, int height, int x, int y)
{
int32_t cnt;
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index cb1e095..d9b2a61 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -111,7 +111,9 @@
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
- int stride, int h, int x, int y) {
+ ptrdiff_t stride, int h,
+ int x, int y)
+{
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
@@ -183,7 +185,10 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
/* this code assume that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
-static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
+static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int h,
+ int x, int y)
+{
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index 95ba0a9..5579bd9 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -292,7 +292,10 @@ static const int rv40_bias[4][4] = {
};
#define RV40_CHROMA_MC(OPNAME, OP)\
-static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\
+ uint8_t *src /*align 1*/,\
+ ptrdiff_t stride, int h, int x, int y)\
+{\
const int A = (8-x) * (8-y);\
const int B = ( x) * (8-y);\
const int C = (8-x) * ( y);\
@@ -313,7 +316,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}\
}else{\
const int E = B + C;\
- const int step = C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i = 0; i < h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0] + bias));\
OP(dst[1], (A*src[1] + E*src[step+1] + bias));\
@@ -325,7 +328,10 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}\
}\
\
-static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\
+ uint8_t *src/*align 1*/,\
+ ptrdiff_t stride, int h, int x, int y)\
+{\
const int A = (8-x) * (8-y);\
const int B = ( x) * (8-y);\
const int C = (8-x) * ( y);\
@@ -350,7 +356,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}\
}else{\
const int E = B + C;\
- const int step = C ? stride : 1;\
+ const ptrdiff_t step = C ? stride : 1;\
for(i = 0; i < h; i++){\
OP(dst[0], (A*src[0] + E*src[step+0] + bias));\
OP(dst[1], (A*src[1] + E*src[step+1] + bias));\
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index eaadebe..9239a4a 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -782,7 +782,7 @@ PUT_VC1_MSPEL(3, 3)
C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = (x) * (8 - y);
@@ -807,7 +807,7 @@ static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
}
static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = (x) * (8 - y);
@@ -830,7 +830,7 @@ static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
#define avg2(a, b) (((a) + (b) + 1) >> 1)
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = (x) * (8 - y);
@@ -856,7 +856,7 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */,
- int stride, int h, int x, int y)
+ ptrdiff_t stride, int h, int x, int y)
{
const int A = (8 - x) * (8 - y);
const int B = ( x) * (8 - y);
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index fa698e5..b5a78b5 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -105,11 +105,8 @@ SECTION .text
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
; uint8_t *src /* align 1 */,
-; int stride, int h, int mx, int my)
+; ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
@@ -291,9 +288,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
pxor m7, m7
movd m2, r4d ; x
movd m3, r5d ; y
@@ -376,10 +370,6 @@ cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
-
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
@@ -465,9 +455,6 @@ chroma_mc4_mmx_func avg, rv40
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
@@ -613,9 +600,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
mov r6, r4
shl r4d, 8
sub r4d, r6d
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index c358482..34bc419 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -57,12 +57,11 @@ SECTION .text
%endmacro
;-----------------------------------------------------------------------------
-; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h,
-; int mx, int my)
+; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
+; int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
cglobal %1_h264_chroma_mc8_10, 6,7,8
- movsxdifnidn r2, r2d
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
@@ -149,8 +148,8 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
%endmacro
;-----------------------------------------------------------------------------
-; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h,
-; int mx, int my)
+; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
+; int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
@@ -174,7 +173,6 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
- movsxdifnidn r2, r2d
movd m2, r4m ; x
movd m3, r5m ; y
mova m4, [pw_8]
@@ -200,12 +198,11 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7
%endmacro
;-----------------------------------------------------------------------------
-; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h,
-; int mx, int my)
+; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
+; int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
- movsxdifnidn r2, r2d
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index e08af27..36bf29d 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -25,38 +25,38 @@
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 218deb8..340173d 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -41,18 +41,18 @@ static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src,
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index e05ae06..79d22a2 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -83,15 +83,15 @@ DECLARE_FUNCTION(avg_, 16, _sse2)
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
======================================================================
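
Below the separator are the combined (diff --cc) hunks for the files that required manual merge resolution; the h264chroma.h hunk keeps FFmpeg's 4-entry function tables rather than Libav's 3-entry ones while taking the ptrdiff_t signature. For reference only, a condensed reconstruction of how that header reads after the merge (comments are mine, not from the source; the diff --cc hunk below is authoritative):

/* Condensed reconstruction of libavcodec/h264chroma.h after this merge. */
#include <stddef.h>   /* ptrdiff_t: the include this commit adds */
#include <stdint.h>

typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/,
                                    uint8_t *src /*align 1*/,
                                    ptrdiff_t srcStride,
                                    int h, int x, int y);

typedef struct H264ChromaContext {
    /* FFmpeg keeps 4 entries (mc1/mc2/mc4/mc8); the merged-in header
     * had 3, which is the conflict resolved in the hunk below. */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[4];
} H264ChromaContext;

void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
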
diff --cc libavcodec/arm/vc1dsp_init_neon.c
index c340144,08c07c4..005d45c
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@@ -37,52 -37,50 +37,52 @@@ void ff_vc1_inv_trans_4x4_dc_neon(uint8
void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int rnd);
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd) \
+{ \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+ dst += 8*stride; src += 8*stride; \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
- void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
- void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
- void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
- void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
- int x, int y);
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+#define FN_ASSIGN(X, Y) \
+ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+
av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
{
dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
diff --cc libavcodec/h264chroma.h
index e0f45ad,9fc2a0f..5c89fd1
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@@ -19,13 -19,14 +19,14 @@@
#ifndef AVCODEC_H264CHROMA_H
#define AVCODEC_H264CHROMA_H
+ #include <stddef.h>
#include <stdint.h>
- typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+ typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ptrdiff_t srcStride, int h, int x, int y);
typedef struct H264ChromaContext {
- h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
- h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+ h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
+ h264_chroma_mc_func avg_h264_chroma_pixels_tab[4];
} H264ChromaContext;
void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
diff --cc libavcodec/h264chroma_template.c
index 072b5e0,ed364dd..a3ca07b
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@@ -19,46 -19,14 +19,49 @@@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
- #include "libavutil/avassert.h"
-#include <assert.h>
+
+ #include <stddef.h>
+
++#include "libavutil/avassert.h"
#include "bit_depth_template.c"
#define H264_CHROMA_MC(OPNAME, OP)\
- static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
++static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y){\
+ pixel *dst = (pixel*)_dst;\
+ pixel *src = (pixel*)_src;\
+ const int A=(8-x)*(8-y);\
+ const int B=( x)*(8-y);\
+ const int C=(8-x)*( y);\
+ const int D=( x)*( y);\
+ int i;\
+ stride >>= sizeof(pixel)-1;\
+ \
+ av_assert2(x<8 && y<8 && x>=0 && y>=0);\
+\
+ if(D){\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ } else if (B + C) {\
+ const int E= B+C;\
+ const int step= C ? stride : 1;\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + E*src[step+0]));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ } else {\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0]));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ }\
+}\
- static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+ {\
pixel *dst = (pixel*)_dst;\
pixel *src = (pixel*)_src;\
const int A=(8-x)*(8-y);\
diff --cc libavcodec/mips/h264chroma_mips.h
index 6e6127d,0000000..996384d
mode 100644,000000..100644
--- a/libavcodec/mips/h264chroma_mips.h
+++ b/libavcodec/mips/h264chroma_mips.h
@@@ -1,47 -1,0 +1,47 @@@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264CHROMA_MIPS_H
+#define AVCODEC_MIPS_H264CHROMA_MIPS_H
+
+#include "libavcodec/h264dec.h"
- void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
- void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int height, int x, int y);
+
- void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
- void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
- void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
- void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */
diff --cc libavcodec/mips/h264chroma_mmi.c
index 417b4a2,0000000..bafe0f9
mode 100644,000000..100644
--- a/libavcodec/mips/h264chroma_mmi.c
+++ b/libavcodec/mips/h264chroma_mmi.c
@@@ -1,704 -1,0 +1,704 @@@
+/*
+ * Loongson SIMD optimized h264chroma
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
+ * Zhang Shuangshuang <zhangshuangshuang at ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+#include "constants.h"
+#include "libavutil/mips/mmiutils.h"
+
- void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+ const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[10];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_ALL64;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[src], 0x01)
+ MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
+ "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
+ "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
+ "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp4] \n\t"
+
+ "1: \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
+
- void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+ const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[10];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_ALL64;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[src], 0x01)
+ MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
+ "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
+ "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
+ "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
+ "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp4] \n\t"
+
+ "1: \n\t"
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+
+ MMI_ULDC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "pmullh %[ftmp2], %[ftmp3], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ MMI_LDC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SDC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
+
- void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+ const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[8];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_LOW32;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[src], 0x01)
+ MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
+ "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp3] \n\t"
+
+ "1: \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
+
- void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
++void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+ const int B = x * (8 - y);
+ const int C = (8 - x) * y;
+ const int D = x * y;
+ const int E = B + C;
+ double ftmp[8];
+ uint64_t tmp[1];
+ mips_reg addr[1];
+ DECLARE_VAR_LOW32;
+
+ if (D) {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[B], %[B], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "pshufh %[C], %[C], %[ftmp0] \n\t"
+ "pshufh %[D], %[D], %[ftmp0] \n\t"
+
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[stride] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[src], 0x01)
+ MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+ MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
+
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+
+ "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
+ "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [B]"f"(B),
+ [C]"f"(C), [D]"f"(D)
+ : "memory"
+ );
+ } else if (E) {
+ const int step = C ? stride : 1;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "pshufh %[E], %[E], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+ "1: \n\t"
+ PTR_ADDU "%[addr0], %[src], %[step] \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
+
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
+ "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
+ "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x01 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [addr0]"=&r"(addr[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+ [ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A), [E]"f"(E)
+ : "memory"
+ );
+ } else {
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dli %[tmp0], 0x06 \n\t"
+ "pshufh %[A], %[A], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp3] \n\t"
+
+ "1: \n\t"
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+
+ MMI_ULWC1(%[ftmp1], %[src], 0x00)
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "pmullh %[ftmp1], %[ftmp2], %[A] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ MMI_LWC1(%[ftmp2], %[dst], 0x00)
+ "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "addi %[h], %[h], -0x02 \n\t"
+ MMI_SWC1(%[ftmp1], %[dst], 0x00)
+
+ PTR_ADDU "%[src], %[src], %[stride] \n\t"
+ PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
+ "bnez %[h], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_LOW32
+ [dst]"+&r"(dst), [src]"+&r"(src),
+ [h]"+&r"(h)
+ : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+ [A]"f"(A)
+ : "memory"
+ );
+ }
+}
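
[Editorial note, not part of the patch] The MMI kernels above all evaluate the
standard H.264 bilinear chroma interpolation: the A/B/C/D weights come from the
1/8-pel fractional offsets x and y, the sum is rounded by adding 32 (ff_pw_32)
and shifted right by 6, and the avg variants additionally average the result
with the bytes already present in dst (pavgb). A minimal scalar sketch of the
put case follows; the helper name is hypothetical and the semantics are assumed
to match the generic C code in libavcodec/h264chroma_template.c.

#include <stddef.h>
#include <stdint.h>

static void put_h264_chroma_mc4_ref(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int h, int x, int y)
{
    /* Bilinear weights derived from the 1/8-pel fractional position (x, y). */
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++)
            /* Weighted sum of the four neighbours, rounded and scaled back. */
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      32) >> 6;
        dst += stride;
        src += stride;
    }
}

The special cases in the code above (D == 0, then E = B + C) are just this
formula with one or more weights equal to zero, which lets the SIMD versions
drop loads and multiplies without changing the result.
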
diff --cc libavcodec/mips/h264chroma_msa.c
index 67d0bc1,0000000..940e12d
mode 100644,000000..100644
--- a/libavcodec/mips/h264chroma_msa.c
+++ b/libavcodec/mips/h264chroma_msa.c
@@@ -1,2003 -1,0 +1,2003 @@@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil at imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264chroma_mips.h"
+
+static const uint8_t chroma_mask_arr[16 * 5] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ v16i8 src0, src1;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ out0 = __msa_copy_u_h(res, 0);
+ out1 = __msa_copy_u_h(res, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[64]);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+ src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+ res_r = __msa_dotp_u_h(src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[64]);
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+
+ ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+
+ res_r = __msa_dotp_u_h(src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ res_r = __msa_dotp_u_h(src4, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16i8 src0, src1;
+ v8u16 res_r;
+ v4i32 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3;
+ v8u16 res0_r, res1_r;
+ v4i32 res0, res1;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else {
+ avc_chroma_hz_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+ coeff1, height);
+ }
+}
+
+static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+
+ if (0 != (height % 4)) {
+ for (row = (height % 4); row--;) {
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+
+ res0 = __msa_dotp_u_h(src0, coeff_vec);
+ res0 <<= 3;
+ res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
+ res0 = __msa_sat_u_h(res0, 7);
+ res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+
+ ST8x1_UB(res0, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ v16i8 src0, src1, src2;
+ v16u8 tmp0, tmp1;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+
+ ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ out0 = __msa_copy_u_h(res, 0);
+ out1 = __msa_copy_u_h(res, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+}
+
+static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_vt_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_vt_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2;
+ v16u8 tmp0, tmp1;
+ v4i32 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res0_r, res1_r;
+ v4i32 res0, res1;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+ DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+ } else {
+ avc_chroma_vt_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+ coeff1, height);
+ }
+}
+
+static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4, out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ src0, src1, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ uint16_t out0, out1;
+ v16u8 src0, src1, src2;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v8i16 res_vert;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ out0 = __msa_copy_u_h(res_vert, 0);
+ out1 = __msa_copy_u_h(res_vert, 1);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v8i16 res;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ } else if (4 == height) {
+ avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ } else if (8 == height) {
+ avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ }
+}
+
+static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 mask;
+ v4i32 res;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+ LD_UB3(src, src_stride, src0, src1, src2);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+ v4i32 res0, res1;
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3);
+ MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+ ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+ SRARI_H2_UH(res_vt0, res_vt1, 6);
+ SAT_UH2_UH(res_vt0, res_vt1, 7);
+ PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
+ coef_hor1, coef_ver0, coef_ver1);
+ } else {
+ avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1, coef_ver0,
+ coef_ver1, height);
+ }
+}
+
+static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0, uint32_t coef_hor1,
+ uint32_t coef_ver0, uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4, out0, out1;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+ res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+ VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+ DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+ res_hz4);
+ MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+ coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+
+ res_vt0 += (res_hz0 * coeff_vt_vec1);
+ res_vt1 += (res_hz1 * coeff_vt_vec1);
+ res_vt2 += (res_hz2 * coeff_vt_vec1);
+ res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+ SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+ SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+ PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ dst += (4 * dst_stride);
+
+ res_hz0 = res_hz4;
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ uint32_t load0, load1;
+ v16i8 src0, src1;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16u8 res;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst_data = __msa_aver_u_b(res, dst_data);
+
+ out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+ out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 res_r;
+ v16i8 res, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[64]);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+ src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+ res_r = __msa_dotp_u_h(src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8u16 res0_r, res1_r;
+ v16u8 res0, res1, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_UB(&chroma_mask_arr[64]);
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+ ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+ DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1;
+ v16i8 src0, src1;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16i8 res, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_SB2(src, src_stride, src0, src1);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+
+ src0 = __msa_vshf_b(mask, src1, src0);
+
+ res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+
+ ST4x2_UB(dst_data, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint32_t coeff0,
+ uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t load0, load1;
+ uint32_t row;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0 = { 0 };
+ v16u8 dst1 = { 0 };
+ v8u16 res0_r, res1_r;
+ v16u8 res0, res1, mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_UB(&chroma_mask_arr[0]);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst0);
+
+ load0 = LW(dst + 2 * dst_stride);
+ load1 = LW(dst + 3 * dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst1);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+ DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+ ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else {
+ avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride,
+ dst, dst_stride,
+ coeff0, coeff1, height);
+ }
+}
+
+static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 mask;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint16_t out0, out1;
+ uint32_t load0, load1;
+ v16i8 src0, src1, src2, tmp0, tmp1, res;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+
+ ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+ res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+ out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+ out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1;
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_r;
+ v8i16 res;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+ v16u8 dst_data = { 0 };
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+
+ load0 = LW(dst + 2 * dst_stride);
+ load1 = LW(dst + 3 * dst_stride);
+
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
+ dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1, load2, load3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 res;
+ v8u16 res_r;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+ v16u8 dst_data0 = { 0 };
+ v16u8 dst_data1 = { 0 };
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_SB4(src, src_stride, src5, src6, src7, src8);
+
+ LW4(dst, dst_stride, load0, load1, load2, load3);
+
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
+ dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
+
+ LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
+
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
+ dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+ tmp0, tmp1, tmp2, tmp3);
+
+ ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+ tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+
+ res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);
+
+ ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (4 == height) {
+ avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else if (8 == height) {
+ avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1)
+{
+ uint32_t load0, load1;
+ v16i8 src0, src1, src2, tmp0, tmp1;
+ v16u8 dst_data = { 0 };
+ v8u16 res_r;
+ v16u8 res;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst_data);
+ ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+ tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+ res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+ res_r <<= 3;
+ res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+ res_r = __msa_sat_u_h(res_r, 7);
+ res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+ res = __msa_aver_u_b(res, dst_data);
+
+ ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint32_t coeff0,
+ uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t load0, load1, row;
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 dst0 = { 0 };
+ v16u8 dst1 = { 0 };
+ v8u16 res0_r, res1_r;
+ v16u8 res0, res1;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ load0 = LW(dst);
+ load1 = LW(dst + dst_stride);
+
+ INSERT_W2_UB(load0, load1, dst0);
+ load0 = LW(dst + 2 * dst_stride);
+ load1 = LW(dst + 3 * dst_stride);
+ INSERT_W2_UB(load0, load1, dst1);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+ DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+ res0_r <<= 3;
+ res1_r <<= 3;
+
+ SRARI_H2_UH(res0_r, res1_r, 6);
+ SAT_UH2_UH(res0_r, res1_r, 7);
+ PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1);
+ } else {
+ avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+ coeff0, coeff1, height);
+ }
+}
+
+static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coeff0, uint32_t coeff1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v8u16 res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+ v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+ v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = height >> 2; row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ src0, src1, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+ coeff_vec, res0, res1, res2, res3);
+ SLLI_4V(res0, res1, res2, res3, 3);
+ SRARI_H4_UH(res0, res1, res2, res3, 6);
+ SAT_UH4_UH(res0, res1, res2, res3, 7);
+ PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ uint16_t out0, out1;
+ v16u8 dst0, dst1;
+ v16u8 src0, src1, src2;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+ out0 = __msa_copy_u_h((v8i16) dst0, 0);
+ out1 = __msa_copy_u_h((v8i16) dst0, 1);
+
+ SH(out0, dst);
+ dst += dst_stride;
+ SH(out1, dst);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[48]);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+ LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+ dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+ dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+ VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+ ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST2x4_UB(dst0, 0, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst4 = __msa_aver_u_b((v16u8) res, dst4);
+
+ ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ } else if (4 == height) {
+ avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ } else if (8 == height) {
+ avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1)
+{
+ v16u8 src0, src1, src2;
+ v16u8 dst0, dst1;
+ v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+ v16i8 res, mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ LD_UB3(src, src_stride, src0, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+ MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+ res_vt0 += res_vt1;
+ res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+ res_vt0 = __msa_sat_u_h(res_vt0, 7);
+ res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+ dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+ dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+ ST4x2_UB(dst0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+ v16u8 res0, res1;
+
+ mask = LD_SB(&chroma_mask_arr[0]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+ VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+ DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3);
+ MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+ ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+ SRARI_H2_UH(res_vt0, res_vt1, 6);
+ SAT_UH2_UH(res_vt0, res_vt1, 7);
+ PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+ dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+ dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+
+ AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+ ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ src0 = src4;
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ if (2 == height) {
+ avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1);
+ } else {
+ avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+ coef_hor0, coef_hor1,
+ coef_ver0, coef_ver1, height);
+ }
+}
+
+static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ uint32_t coef_hor0,
+ uint32_t coef_hor1,
+ uint32_t coef_ver0,
+ uint32_t coef_ver1,
+ int32_t height)
+{
+ uint32_t row;
+ v16u8 src0, src1, src2, src3, src4, out0, out1;
+ v8u16 res_hz0, res_hz1, res_hz2;
+ v8u16 res_hz3, res_hz4;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+ v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+ v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+ v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+ v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+ mask = LD_SB(&chroma_mask_arr[32]);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+ res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+ for (row = (height >> 2); row--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+ VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+ DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+ res_hz4);
+ MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+ coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3);
+
+ res_vt0 += (res_hz0 * coeff_vt_vec1);
+ res_vt1 += (res_hz1 * coeff_vt_vec1);
+ res_vt2 += (res_hz2 * coeff_vt_vec1);
+ res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+ SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+ SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+
+ PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ res_hz0 = res_hz4;
+ }
+}
+
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+ out4 = __msa_copy_u_d((v2i64) src4, 0);
+ out5 = __msa_copy_u_d((v2i64) src5, 0);
+ out6 = __msa_copy_u_d((v2i64) src6, 0);
+ out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+ out4 = __msa_copy_u_d((v2i64) src4, 0);
+ out5 = __msa_copy_u_d((v2i64) src5, 0);
+ out6 = __msa_copy_u_d((v2i64) src6, 0);
+ out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+ out2 = __msa_copy_u_d((v2i64) src2, 0);
+ out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64) src0, 0);
+ out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width4_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ out0 = __msa_copy_u_w((v4i32) dst0, 0);
+ out1 = __msa_copy_u_w((v4i32) dst1, 0);
+ out2 = __msa_copy_u_w((v4i32) dst2, 0);
+ out3 = __msa_copy_u_w((v4i32) dst3, 0);
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LD_UB2(dst, dst_stride, dst0, dst1);
+
+ AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+ out0 = __msa_copy_u_w((v4i32) dst0, 0);
+ out1 = __msa_copy_u_w((v4i32) dst1, 0);
+ SW(out0, dst);
+ dst += dst_stride;
+ SW(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ out0 = __msa_copy_u_d((v2i64) dst0, 0);
+ out1 = __msa_copy_u_d((v2i64) dst1, 0);
+ out2 = __msa_copy_u_d((v2i64) dst2, 0);
+ out3 = __msa_copy_u_d((v2i64) dst3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_8w_msa(src, stride, dst,
+ stride, x, (8 - x), y, (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_8w_msa(src, stride, dst, stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_8w_msa(src, stride, dst, stride, y, (8 - y), height);
+ } else {
+ copy_width8_msa(src, stride, dst, stride, height);
+ }
+}
+
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ int32_t cnt;
+
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_4w_msa(src, stride, dst,
+ stride, x, (8 - x), y, (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_4w_msa(src, stride, dst, stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_4w_msa(src, stride, dst, stride, y, (8 - y), height);
+ } else {
+ for (cnt = height; cnt--;) {
+ *((uint32_t *) dst) = *((uint32_t *) src);
+
+ src += stride;
+ dst += stride;
+ }
+ }
+}
+
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ int32_t cnt;
+
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_2w_msa(src, stride, dst,
+ stride, x, (8 - x), y, (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_2w_msa(src, stride, dst, stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_2w_msa(src, stride, dst, stride, y, (8 - y), height);
+ } else {
+ for (cnt = height; cnt--;) {
+ *((uint16_t *) dst) = *((uint16_t *) src);
+
+ src += stride;
+ dst += stride;
+ }
+ }
+}
+
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+
+ if (x && y) {
+ avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
+ stride, x, (8 - x), y,
+ (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst,
+ stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst,
+ stride, y, (8 - y), height);
+ } else {
+ avg_width8_msa(src, stride, dst, stride, height);
+ }
+}
+
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
+ stride, x, (8 - x), y,
+ (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst,
+ stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst,
+ stride, y, (8 - y), height);
+ } else {
+ avg_width4_msa(src, stride, dst, stride, height);
+ }
+}
+
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
- int stride, int height, int x, int y)
++ ptrdiff_t stride, int height, int x, int y)
+{
+ int32_t cnt;
+
+ av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+ if (x && y) {
+ avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
+ stride, x, (8 - x), y,
+ (8 - y), height);
+ } else if (x) {
+ avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst,
+ stride, x, (8 - x), height);
+ } else if (y) {
+ avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst,
+ stride, y, (8 - y), height);
+ } else {
+ for (cnt = height; cnt--;) {
+ dst[0] = (dst[0] + src[0] + 1) >> 1;
+ dst[1] = (dst[1] + src[1] + 1) >> 1;
+
+ src += stride;
+ dst += stride;
+ }
+ }
+}
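
For orientation (not part of the commit): every MSA "put"/"avg" kernel above vectorizes the same bilinear H.264 chroma interpolation, with weights (8-x)(8-y), x(8-y), (8-x)y and xy summing to 64. A minimal scalar sketch of that arithmetic follows; the function name, the width parameter and the absence of the x==0/y==0 fast paths are illustrative assumptions, not code from this merge.

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for a w x h chroma block at fractional offset (x, y);
 * the SIMD code above computes the same sums eight (or four, or two)
 * pixels at a time, and the "avg" variants additionally round-average
 * the result against the existing dst pixels. */
static void put_chroma_bilinear(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride, int w, int h,
                                int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      32) >> 6;
        src += stride;
        dst += stride;
    }
}
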
diff --cc libavcodec/ppc/h264chroma_template.c
index cb1e095,daa7652..d9b2a61
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@@ -72,46 -70,11 +72,48 @@@
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
+#if HAVE_BIGENDIAN
+#define GET_VSRC1(vs0, off, b, perm0, s){ \
+ vec_u8 vsrcCuc, vsrcDuc; \
+ vsrcCuc = vec_ld(off, s); \
+ if (loadSecond){ \
+ vsrcDuc = vec_ld(off + b, s); \
+ } else \
+ vsrcDuc = vsrcCuc; \
+ \
+ vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+ vec_u8 vsrcCuc, vsrcDuc; \
+ vsrcCuc = vec_ld(off, s); \
+ if (loadSecond){ \
+ vsrcDuc = vec_ld(off + b, s); \
+ } else \
+ vsrcDuc = vsrcCuc; \
+ \
+ vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+ if (reallyBadAlign){ \
+ vs1 = vsrcDuc; \
+ } else \
+ vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1); \
+ }
+
+#else
+
+#define GET_VSRC1(vs0, off, b, perm0, s){ \
+ vs0 = vec_vsx_ld(off, s); \
+ }
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+ vs0 = vec_vsx_ld(off, s); \
+ vs1 = vec_vsx_ld(off + 1, s); \
+ }
+#endif /* HAVE_BIGENDIAN */
+
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
- int stride, int h, int x, int y) {
+ ptrdiff_t stride, int h,
+ int x, int y)
+ {
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
diff --cc libavcodec/x86/rv40dsp_init.c
index 218deb8,7bf3ecd..340173d
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@@ -32,27 -32,20 +32,27 @@@
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
diff --cc libavcodec/x86/vc1dsp_init.c
index e05ae06,8982ff9..79d22a2
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@@ -83,23 -71,15 +83,23 @@@ DECLARE_FUNCTION(avg_, 16, _sse2
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
+ ptrdiff_t stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+ int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
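
The functional content of the merge is the prototype change repeated throughout this diff: chroma motion-compensation strides move from int to ptrdiff_t, the type of a pointer difference. A short illustrative sketch of why that type fits (the typedef and helper names below are assumptions for the example, not identifiers taken from h264chroma.h):

#include <stddef.h>
#include <stdint.h>

/* Strides are combined with pointers and may legitimately be negative
 * (e.g. walking a picture bottom-up), so the pointer-difference type
 * keeps the arithmetic at full pointer width without implicit
 * conversions from int. */
typedef void (*chroma_mc_fn)(uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h, int x, int y);

static uint8_t *row_ptr(uint8_t *base, ptrdiff_t stride, int row)
{
    /* row is converted to ptrdiff_t, so row * stride is evaluated at
     * pointer width rather than in an int-sized intermediate. */
    return base + row * stride;
}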