[FFmpeg-cvslog] avcodec/mips: Improve hevc bi copy mc msa functions
Kaustubh Raste
git at videolan.org
Fri Oct 13 04:20:10 EEST 2017
ffmpeg | branch: master | Kaustubh Raste <kaustubh.raste at imgtec.com> | Wed Oct 11 14:37:27 2017 +0530| [e63758468c6420871c883fb96b8b6832adb5b2ee] | committer: Michael Niedermayer
avcodec/mips: Improve hevc bi copy mc msa functions
Load only the specific destination bytes instead of doing a full MSA load and pack.
Use immediate unsigned saturation for the clip to max, saving one vector register.
Signed-off-by: Kaustubh Raste <kaustubh.raste at imgtec.com>
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e63758468c6420871c883fb96b8b6832adb5b2ee
---
libavcodec/mips/hevc_mc_bi_msa.c | 652 ++++++++++++++++++++++-----------------
1 file changed, 366 insertions(+), 286 deletions(-)
diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
index 8208be327d..ccc3f8a64f 100644
--- a/libavcodec/mips/hevc_mc_bi_msa.c
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
*
* This file is part of FFmpeg.
*
@@ -37,6 +37,21 @@
HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
}
+#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \
+ out0, out1) \
+{ \
+ ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
+ SRARI_H2_SH(out0, out1, rnd_val); \
+ CLIP_SH2_0_255_MAX_SATU(out0, out1); \
+}
+
+#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
+ vec3, rnd_val, out0, out1, out2, out3) \
+{ \
+ HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
+ HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
+}
+
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
int32_t src_stride,
int16_t *src1_ptr,
@@ -45,71 +60,59 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
+ uint32_t loop_cnt, tp0, tp1, tp2, tp3;
+ uint64_t tpd0, tpd1, tpd2, tpd3;
+ v16i8 src0 = { 0 }, src1 = { 0 };
v16i8 zero = { 0 };
+ v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+ v8i16 dst0, dst1, dst2, dst3;
if (2 == height) {
- v16i8 src0, src1;
- v8i16 dst0, in0, in1;
-
- LD_SB2(src0_ptr, src_stride, src0, src1);
- LD_SH2(src1_ptr, src2_stride, in0, in1);
-
- src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
- in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+ LW2(src0_ptr, src_stride, tp0, tp1);
+ INSERT_W2_SB(tp0, tp1, src0);
+ LD2(src1_ptr, src2_stride, tpd0, tpd1);
+ INSERT_D2_SH(tpd0, tpd1, in0);
dst0 = (v8i16) __msa_ilvr_b(zero, src0);
dst0 <<= 6;
dst0 += in0;
dst0 = __msa_srari_h(dst0, 7);
- dst0 = CLIP_SH_0_255(dst0);
+ dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
ST4x2_UB(dst0, dst, dst_stride);
} else if (4 == height) {
- v16i8 src0, src1, src2, src3;
- v8i16 dst0, dst1;
- v8i16 in0, in1, in2, in3;
-
- LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
- LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
- ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
- ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
- ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
-
- dst0 <<= 6;
- dst1 <<= 6;
- HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
-
+ LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+ LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+ INSERT_D2_SH(tpd0, tpd1, in0);
+ INSERT_D2_SH(tpd2, tpd3, in1);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ SLLI_2V(dst0, dst1, 6);
+ HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == height % 8) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v8i16 dst0, dst1, dst2, dst3;
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_SB8(src0_ptr, src_stride,
- src0, src1, src2, src3, src4, src5, src6, src7);
- src0_ptr += (8 * src_stride);
-
- LD_SH8(src1_ptr, src2_stride,
- in0, in1, in2, in3, in4, in5, in6, in7);
- src1_ptr += (8 * src2_stride);
-
- ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
- ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
-
- ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
- src0, src1, src2, src3);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
-
+ LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += 4 * src_stride;
+ INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+ LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += 4 * src_stride;
+ INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
+ LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+ src1_ptr += (4 * src2_stride);
+ INSERT_D2_SH(tpd0, tpd1, in0);
+ INSERT_D2_SH(tpd2, tpd3, in1);
+ LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+ src1_ptr += (4 * src2_stride);
+ INSERT_D2_SH(tpd0, tpd1, in2);
+ INSERT_D2_SH(tpd2, tpd3, in3);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
SLLI_4V(dst0, dst1, dst2, dst3, 6);
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7,
- dst0, dst1, dst2, dst3);
-
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
+ dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, dst_stride);
dst += (8 * dst_stride);
@@ -126,36 +129,39 @@ static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
int32_t height)
{
uint32_t loop_cnt;
+ uint64_t tp0, tp1, tp2, tp3;
+ v16u8 out0, out1, out2, out3;
v16i8 zero = { 0 };
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_SB8(src0_ptr, src_stride,
- src0, src1, src2, src3, src4, src5, src6, src7);
- src0_ptr += (8 * src_stride);
+ LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += (4 * src_stride);
+ INSERT_D2_SB(tp0, tp1, src0);
+ INSERT_D2_SB(tp2, tp3, src1);
+ LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += (4 * src_stride);
+ INSERT_D2_SB(tp0, tp1, src2);
+ INSERT_D2_SB(tp2, tp3, src3);
LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
src1_ptr += (8 * src2_stride);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
- ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
- dst4, dst5, dst6, dst7);
-
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVRL_B2_SH(zero, src2, dst4, dst5);
+ ILVRL_B2_SH(zero, src3, dst6, dst7);
SLLI_4V(dst0, dst1, dst2, dst3, 6);
SLLI_4V(dst4, dst5, dst6, dst7, 6);
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
-
- PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST6x4_UB(dst0, dst1, dst, dst_stride);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+ 7, dst4, dst5, dst6, dst7);
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+ PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+ ST6x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
-
- HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
- dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
-
- PCKEV_B2_SH(dst5, dst4, dst7, dst6, dst4, dst5);
- ST6x4_UB(dst4, dst5, dst, dst_stride);
+ ST6x4_UB(out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
@@ -168,100 +174,83 @@ static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
+ uint64_t tp0, tp1, tp2, tp3;
+ v16u8 out0, out1, out2, out3;
+ v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
v16i8 zero = { 0 };
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
if (2 == height) {
- v16i8 src0, src1;
- v8i16 in0, in1;
- v8i16 dst0, dst1;
-
- LD_SB2(src0_ptr, src_stride, src0, src1);
+ LD2(src0_ptr, src_stride, tp0, tp1);
+ INSERT_D2_SB(tp0, tp1, src0);
LD_SH2(src1_ptr, src2_stride, in0, in1);
- ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
-
- dst0 <<= 6;
- dst1 <<= 6;
- HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
-
- dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(dst0, dst, dst_stride);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ SLLI_2V(dst0, dst1, 6);
+ HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
+ out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+ ST8x2_UB(out0, dst, dst_stride);
} else if (4 == height) {
- v16i8 src0, src1, src2, src3;
- v8i16 in0, in1, in2, in3;
- v8i16 dst0, dst1, dst2, dst3;
-
- LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+ LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_SB(tp0, tp1, src0);
+ INSERT_D2_SB(tp2, tp3, src1);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
-
SLLI_4V(dst0, dst1, dst2, dst3, 6);
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
-
- PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
} else if (6 == height) {
- v16i8 src0, src1, src2, src3, src4, src5;
- v8i16 in0, in1, in2, in3, in4, in5;
- v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
-
- LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+ LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += 4 * src_stride;
+ INSERT_D2_SB(tp0, tp1, src0);
+ INSERT_D2_SB(tp2, tp3, src1);
+ LD2(src0_ptr, src_stride, tp0, tp1);
+ INSERT_D2_SB(tp0, tp1, src2);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVRL_B2_SH(zero, src2, dst4, dst5);
LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
- ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
-
SLLI_4V(dst0, dst1, dst2, dst3, 6);
- dst4 <<= 6;
- dst5 <<= 6;
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
- HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
-
- PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ SLLI_2V(dst4, dst5, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
+ PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+ ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
- ST8x2_UB(dst2, dst, dst_stride);
+ ST8x2_UB(out2, dst, dst_stride);
} else if (0 == height % 8) {
- v16i8 src0, src1, src2, src3;
- v8i16 in0, in1, in2, in3;
- v8i16 dst0, dst1, dst2, dst3;
uint32_t loop_cnt;
for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
- src0_ptr += (4 * src_stride);
- LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
- src1_ptr += (4 * src2_stride);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
-
- SLLI_4V(dst0, dst1, dst2, dst3, 6);
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7,
- dst0, dst1, dst2, dst3);
-
- PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
- src0_ptr += (4 * src_stride);
- LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
- src1_ptr += (4 * src2_stride);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
-
+ LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += 4 * src_stride;
+ INSERT_D2_SB(tp0, tp1, src0);
+ INSERT_D2_SB(tp2, tp3, src1);
+ LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+ src0_ptr += 4 * src_stride;
+ INSERT_D2_SB(tp0, tp1, src2);
+ INSERT_D2_SB(tp2, tp3, src3);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVRL_B2_SH(zero, src2, dst4, dst5);
+ ILVRL_B2_SH(zero, src3, dst6, dst7);
+ LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
+ in7);
+ src1_ptr += (8 * src2_stride);
SLLI_4V(dst0, dst1, dst2, dst3, 6);
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7,
- dst0, dst1, dst2, dst3);
-
- PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
- dst += (4 * dst_stride);
+ SLLI_4V(dst4, dst5, dst6, dst7, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
+ dst3, 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
+ dst7, 7, dst4, dst5, dst6, dst7);
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+ PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+ ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+ dst += (8 * dst_stride);
}
}
}
@@ -275,12 +264,13 @@ static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
int32_t height)
{
uint32_t loop_cnt;
+ v16i8 zero = { 0 };
+ v16u8 out0, out1, out2;
v16i8 src0, src1, src2, src3;
v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
- v16i8 zero = { 0 };
- for (loop_cnt = (16 >> 2); loop_cnt--;) {
+ for (loop_cnt = 4; loop_cnt--;) {
LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
src0_ptr += (4 * src_stride);
@@ -288,88 +278,21 @@ static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
src1_ptr += (4 * src2_stride);
ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0, dst1, dst2, dst3);
-
+ ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
+ dst2, dst3);
SLLI_4V(dst0, dst1, dst2, dst3, 6);
ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
- dst4 <<= 6;
- dst5 <<= 6;
- HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
- dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
- HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
-
- PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
- ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+ SLLI_2V(dst4, dst5, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
+ PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+ ST12x4_UB(out0, out1, out2, dst, dst_stride);
dst += (4 * dst_stride);
}
}
-static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr,
- int32_t src_stride,
- int16_t *src1_ptr,
- int32_t src2_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int32_t height,
- int32_t width)
-{
- uint32_t loop_cnt;
- uint32_t cnt;
- uint8_t *src0_ptr_tmp;
- int16_t *src1_ptr_tmp;
- uint8_t *dst_tmp;
- v16i8 zero = { 0 };
-
- for (cnt = (width >> 4); cnt--;) {
- src0_ptr_tmp = src0_ptr;
- src1_ptr_tmp = src1_ptr;
- dst_tmp = dst;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- v16i8 src0, src1, src2, src3;
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
- v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
-
- LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
- src0_ptr_tmp += (4 * src_stride);
- LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
- LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
- src1_ptr_tmp += (4 * src2_stride);
-
- ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0_r, dst1_r, dst2_r, dst3_r);
- ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
- dst0_l, dst1_l, dst2_l, dst3_l);
-
- SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
- SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
- HEVC_BI_RND_CLIP4(in0, in1, in4, in5,
- dst0_r, dst1_r, dst0_l, dst1_l, 7,
- dst0_r, dst1_r, dst0_l, dst1_l);
-
- PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
- ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
- dst_tmp += (2 * dst_stride);
-
- HEVC_BI_RND_CLIP4(in2, in3, in6, in7,
- dst2_r, dst3_r, dst2_l, dst3_l, 7,
- dst2_r, dst3_r, dst2_l, dst3_l);
-
- PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
- ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
- dst_tmp += (2 * dst_stride);
- }
-
- src0_ptr += 16;
- src1_ptr += 16;
- dst += 16;
- }
-}
-
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
int32_t src_stride,
int16_t *src1_ptr,
@@ -378,8 +301,34 @@ static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
- hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
- dst, dst_stride, height, 16);
+ uint32_t loop_cnt;
+ v16u8 out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+ v16i8 zero = { 0 };
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+ src0_ptr += (4 * src_stride);
+ LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+ LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+ src1_ptr += (4 * src2_stride);
+ ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
+ ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
+ ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
+ ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
+ SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+ SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
+ dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
+ dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
+ PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
+ PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
+ ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
}
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
@@ -390,11 +339,42 @@ static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
- hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
- dst, dst_stride, height, 16);
+ uint32_t loop_cnt;
+ v16u8 out0, out1, out2, out3, out4, out5;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
+ v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
+ LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
+ src0_ptr += (4 * src_stride);
+ LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+ LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+ LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
+ src1_ptr += (4 * src2_stride);
- hevc_bi_copy_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
- dst + 16, dst_stride, height);
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
+ ILVRL_B2_SH(zero, src4, dst6, dst7);
+ ILVRL_B2_SH(zero, src5, dst8, dst9);
+ ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
+ SLLI_4V(dst0, dst1, dst2, dst3, 6);
+ SLLI_4V(dst4, dst5, dst6, dst7, 6);
+ SLLI_4V(dst8, dst9, dst10, dst11, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
+ 7, dst4, dst5, dst6, dst7);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
+ dst11, 7, dst8, dst9, dst10, dst11);
+ PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+ PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+ ST_UB4(out0, out1, out3, out4, dst, dst_stride);
+ ST8x4_UB(out2, out5, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
}
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
@@ -405,8 +385,40 @@ static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
- hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
- dst, dst_stride, height, 32);
+ uint32_t loop_cnt;
+ v16u8 out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 zero = { 0 };
+ v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_SB2(src0_ptr, 16, src0, src1);
+ src0_ptr += src_stride;
+ LD_SB2(src0_ptr, 16, src2, src3);
+ src0_ptr += src_stride;
+ LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+ src1_ptr += src2_stride;
+ LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
+ src1_ptr += src2_stride;
+
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVRL_B2_SH(zero, src2, dst4, dst5);
+ ILVRL_B2_SH(zero, src3, dst6, dst7);
+ SLLI_4V(dst0, dst1, dst2, dst3, 6);
+ SLLI_4V(dst4, dst5, dst6, dst7, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+ 7, dst4, dst5, dst6, dst7);
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+ PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+ ST_UB2(out0, out1, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out2, out3, dst, 16);
+ dst += dst_stride;
+ }
}
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
@@ -417,8 +429,50 @@ static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
- hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
- dst, dst_stride, height, 48);
+ uint32_t loop_cnt;
+ v16u8 out0, out1, out2, out3, out4, out5;
+ v16i8 src0, src1, src2, src3, src4, src5;
+ v16i8 zero = { 0 };
+ v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_SB3(src0_ptr, 16, src0, src1, src2);
+ src0_ptr += src_stride;
+ LD_SB3(src0_ptr, 16, src3, src4, src5);
+ src0_ptr += src_stride;
+
+ LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
+ src1_ptr += src2_stride;
+ LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
+ src1_ptr += src2_stride;
+
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVRL_B2_SH(zero, src2, dst4, dst5);
+ ILVRL_B2_SH(zero, src3, dst6, dst7);
+ ILVRL_B2_SH(zero, src4, dst8, dst9);
+ ILVRL_B2_SH(zero, src5, dst10, dst11);
+
+ SLLI_4V(dst0, dst1, dst2, dst3, 6);
+ SLLI_4V(dst4, dst5, dst6, dst7, 6);
+ SLLI_4V(dst8, dst9, dst10, dst11, 6);
+
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+ 7, dst4, dst5, dst6, dst7);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
+ dst11, 7, dst8, dst9, dst10, dst11);
+ PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+ PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+ ST_UB2(out0, out1, dst, 16);
+ ST_UB(out2, dst + 32);
+ dst += dst_stride;
+ ST_UB2(out3, out4, dst, 16);
+ ST_UB(out5, dst + 32);
+ dst += dst_stride;
+ }
}
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
@@ -429,8 +483,35 @@ static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
int32_t dst_stride,
int32_t height)
{
- hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
- dst, dst_stride, height, 64);
+ uint32_t loop_cnt;
+ v16u8 out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 zero = { 0 };
+ v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
+ src0_ptr += src_stride;
+ LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ src1_ptr += src2_stride;
+
+ ILVRL_B2_SH(zero, src0, dst0, dst1);
+ ILVRL_B2_SH(zero, src1, dst2, dst3);
+ ILVRL_B2_SH(zero, src2, dst4, dst5);
+ ILVRL_B2_SH(zero, src3, dst6, dst7);
+ SLLI_4V(dst0, dst1, dst2, dst3, 6);
+ SLLI_4V(dst4, dst5, dst6, dst7, 6);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ 7, dst0, dst1, dst2, dst3);
+ HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+ 7, dst4, dst5, dst6, dst7);
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+ PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+
+ ST_UB4(out0, out1, out2, out3, dst, 16);
+ dst += dst_stride;
+ }
}
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
@@ -4368,22 +4449,22 @@ BI_MC_COPY(64);
#undef BI_MC_COPY
-#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
-void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- uint8_t *src, \
- ptrdiff_t src_stride, \
- int16_t *src_16bit, \
- int height, \
- intptr_t mx, \
- intptr_t my, \
- int width) \
-{ \
- const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
- \
- hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
- MAX_PB_SIZE, dst, dst_stride, \
- filter, height); \
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
+ MAX_PB_SIZE, dst, dst_stride, \
+ filter, height); \
}
BI_MC(qpel, h, 4, 8, hz, mx);
@@ -4422,41 +4503,40 @@ BI_MC(epel, v, 32, 4, vt, my);
#undef BI_MC
-#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
-void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
- ptrdiff_t dst_stride, \
- uint8_t *src, \
- ptrdiff_t src_stride, \
- int16_t *src_16bit, \
- int height, \
- intptr_t mx, \
- intptr_t my, \
- int width) \
-{ \
- const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
- const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
- \
- hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
- MAX_PB_SIZE, dst, dst_stride, \
- filter_x, filter_y, \
- height); \
+#define BI_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
+ MAX_PB_SIZE, dst, dst_stride, \
+ filter_x, filter_y, height); \
}
-BI_MC_HV(qpel, hv, 4, 8, hv);
-BI_MC_HV(qpel, hv, 8, 8, hv);
-BI_MC_HV(qpel, hv, 12, 8, hv);
-BI_MC_HV(qpel, hv, 16, 8, hv);
-BI_MC_HV(qpel, hv, 24, 8, hv);
-BI_MC_HV(qpel, hv, 32, 8, hv);
-BI_MC_HV(qpel, hv, 48, 8, hv);
-BI_MC_HV(qpel, hv, 64, 8, hv);
-
-BI_MC_HV(epel, hv, 4, 4, hv);
-BI_MC_HV(epel, hv, 8, 4, hv);
-BI_MC_HV(epel, hv, 6, 4, hv);
-BI_MC_HV(epel, hv, 12, 4, hv);
-BI_MC_HV(epel, hv, 16, 4, hv);
-BI_MC_HV(epel, hv, 24, 4, hv);
-BI_MC_HV(epel, hv, 32, 4, hv);
+BI_MC_HV(qpel, 4, 8);
+BI_MC_HV(qpel, 8, 8);
+BI_MC_HV(qpel, 12, 8);
+BI_MC_HV(qpel, 16, 8);
+BI_MC_HV(qpel, 24, 8);
+BI_MC_HV(qpel, 32, 8);
+BI_MC_HV(qpel, 48, 8);
+BI_MC_HV(qpel, 64, 8);
+
+BI_MC_HV(epel, 4, 4);
+BI_MC_HV(epel, 8, 4);
+BI_MC_HV(epel, 6, 4);
+BI_MC_HV(epel, 12, 4);
+BI_MC_HV(epel, 16, 4);
+BI_MC_HV(epel, 24, 4);
+BI_MC_HV(epel, 32, 4);
#undef BI_MC_HV
More information about the ffmpeg-cvslog mailing list