[FFmpeg-devel] [PATCH v2 3/7] avcodec/la: Add LSX optimization for h264 chroma and intrapred.
Hao Chen
chenhao at loongson.cn
Wed May 17 10:03:47 EEST 2023
From: Lu Wang <wanglu at loongson.cn>
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after: 214fps
---
libavcodec/loongarch/Makefile | 4 +-
.../loongarch/h264_intrapred_init_loongarch.c | 18 +-
libavcodec/loongarch/h264_intrapred_lasx.c | 121 --
...pred_lasx.h => h264_intrapred_loongarch.h} | 12 +-
libavcodec/loongarch/h264chroma.S | 966 +++++++++++++
.../loongarch/h264chroma_init_loongarch.c | 10 +-
libavcodec/loongarch/h264chroma_lasx.c | 1280 -----------------
libavcodec/loongarch/h264chroma_lasx.h | 36 -
libavcodec/loongarch/h264chroma_loongarch.h | 41 +
libavcodec/loongarch/h264intrapred.S | 299 ++++
10 files changed, 1342 insertions(+), 1445 deletions(-)
delete mode 100644 libavcodec/loongarch/h264_intrapred_lasx.c
rename libavcodec/loongarch/{h264_intrapred_lasx.h => h264_intrapred_loongarch.h} (70%)
create mode 100644 libavcodec/loongarch/h264chroma.S
delete mode 100644 libavcodec/loongarch/h264chroma_lasx.c
delete mode 100644 libavcodec/loongarch/h264chroma_lasx.h
create mode 100644 libavcodec/loongarch/h264chroma_loongarch.h
create mode 100644 libavcodec/loongarch/h264intrapred.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 111bc23e4e..a563055161 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -9,11 +9,9 @@ OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
-LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
loongarch/h264_deblock_lasx.o
-LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
LASX-OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_lasx.o
LASX-OBJS-$(CONFIG_IDCTDSP) += loongarch/simple_idct_lasx.o \
@@ -33,3 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
+LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o
diff --git a/libavcodec/loongarch/h264_intrapred_init_loongarch.c b/libavcodec/loongarch/h264_intrapred_init_loongarch.c
index 12620bd842..c415fa30da 100644
--- a/libavcodec/loongarch/h264_intrapred_init_loongarch.c
+++ b/libavcodec/loongarch/h264_intrapred_init_loongarch.c
@@ -21,7 +21,7 @@
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264pred.h"
-#include "h264_intrapred_lasx.h"
+#include "h264_intrapred_loongarch.h"
av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
const int bit_depth,
@@ -30,6 +30,22 @@ av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
+ if (have_lsx(cpu_flags)) {
+ if (chroma_format_idc <= 1) {
+ }
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+ } else {
+ if (chroma_format_idc <= 1) {
+ }
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lsx;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lsx;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lsx;
+ }
+ }
+ }
if (have_lasx(cpu_flags)) {
if (chroma_format_idc <= 1) {
}
diff --git a/libavcodec/loongarch/h264_intrapred_lasx.c b/libavcodec/loongarch/h264_intrapred_lasx.c
deleted file mode 100644
index c38cd611b8..0000000000
--- a/libavcodec/loongarch/h264_intrapred_lasx.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Hao Chen <chenhao at loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264_intrapred_lasx.h"
-
-#define PRED16X16_PLANE \
- ptrdiff_t stride_1, stride_2, stride_3, stride_4, stride_5, stride_6; \
- ptrdiff_t stride_8, stride_15; \
- int32_t res0, res1, res2, res3, cnt; \
- uint8_t *src0, *src1; \
- __m256i reg0, reg1, reg2, reg3, reg4; \
- __m256i tmp0, tmp1, tmp2, tmp3; \
- __m256i shuff = {0x0B040A0509060807, 0x0F000E010D020C03, 0, 0}; \
- __m256i mult = {0x0004000300020001, 0x0008000700060005, 0, 0}; \
- __m256i int_mult1 = {0x0000000100000000, 0x0000000300000002, \
- 0x0000000500000004, 0x0000000700000006}; \
- \
- stride_1 = -stride; \
- stride_2 = stride << 1; \
- stride_3 = stride_2 + stride; \
- stride_4 = stride_2 << 1; \
- stride_5 = stride_4 + stride; \
- stride_6 = stride_3 << 1; \
- stride_8 = stride_4 << 1; \
- stride_15 = (stride_8 << 1) - stride; \
- src0 = src - 1; \
- src1 = src0 + stride_8; \
- \
- reg0 = __lasx_xvldx(src0, -stride); \
- reg1 = __lasx_xvldx(src, (8 - stride)); \
- reg0 = __lasx_xvilvl_d(reg1, reg0); \
- reg0 = __lasx_xvshuf_b(reg0, reg0, shuff); \
- reg0 = __lasx_xvhsubw_hu_bu(reg0, reg0); \
- reg0 = __lasx_xvmul_h(reg0, mult); \
- res1 = (src1[0] - src0[stride_6]) + \
- 2 * (src1[stride] - src0[stride_5]) + \
- 3 * (src1[stride_2] - src0[stride_4]) + \
- 4 * (src1[stride_3] - src0[stride_3]) + \
- 5 * (src1[stride_4] - src0[stride_2]) + \
- 6 * (src1[stride_5] - src0[stride]) + \
- 7 * (src1[stride_6] - src0[0]) + \
- 8 * (src0[stride_15] - src0[stride_1]); \
- reg0 = __lasx_xvhaddw_w_h(reg0, reg0); \
- reg0 = __lasx_xvhaddw_d_w(reg0, reg0); \
- reg0 = __lasx_xvhaddw_q_d(reg0, reg0); \
- res0 = __lasx_xvpickve2gr_w(reg0, 0); \
-
-#define PRED16X16_PLANE_END \
- res2 = (src0[stride_15] + src[15 - stride] + 1) << 4; \
- res3 = 7 * (res0 + res1); \
- res2 -= res3; \
- reg0 = __lasx_xvreplgr2vr_w(res0); \
- reg1 = __lasx_xvreplgr2vr_w(res1); \
- reg2 = __lasx_xvreplgr2vr_w(res2); \
- reg3 = __lasx_xvmul_w(reg0, int_mult1); \
- reg4 = __lasx_xvslli_w(reg0, 3); \
- reg4 = __lasx_xvadd_w(reg4, reg3); \
- for (cnt = 8; cnt--;) { \
- tmp0 = __lasx_xvadd_w(reg2, reg3); \
- tmp1 = __lasx_xvadd_w(reg2, reg4); \
- tmp0 = __lasx_xvssrani_hu_w(tmp1, tmp0, 5); \
- tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); \
- reg2 = __lasx_xvadd_w(reg2, reg1); \
- tmp2 = __lasx_xvadd_w(reg2, reg3); \
- tmp3 = __lasx_xvadd_w(reg2, reg4); \
- tmp1 = __lasx_xvssrani_hu_w(tmp3, tmp2, 5); \
- tmp1 = __lasx_xvpermi_d(tmp1, 0xD8); \
- tmp0 = __lasx_xvssrani_bu_h(tmp1, tmp0, 0); \
- reg2 = __lasx_xvadd_w(reg2, reg1); \
- __lasx_xvstelm_d(tmp0, src, 0, 0); \
- __lasx_xvstelm_d(tmp0, src, 8, 2); \
- src += stride; \
- __lasx_xvstelm_d(tmp0, src, 0, 1); \
- __lasx_xvstelm_d(tmp0, src, 8, 3); \
- src += stride; \
- }
-
-
-void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
- PRED16X16_PLANE
- res0 = (5 * res0 + 32) >> 6;
- res1 = (5 * res1 + 32) >> 6;
- PRED16X16_PLANE_END
-}
-
-void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
- PRED16X16_PLANE
- res0 = (res0 + (res0 >> 2)) >> 4;
- res1 = (res1 + (res1 >> 2)) >> 4;
- PRED16X16_PLANE_END
-}
-
-void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
- PRED16X16_PLANE
- cnt = (5 * (res0/4)) / 16;
- res0 = (5 * (res1/4)) / 16;
- res1 = cnt;
- PRED16X16_PLANE_END
-}
diff --git a/libavcodec/loongarch/h264_intrapred_lasx.h b/libavcodec/loongarch/h264_intrapred_loongarch.h
similarity index 70%
rename from libavcodec/loongarch/h264_intrapred_lasx.h
rename to libavcodec/loongarch/h264_intrapred_loongarch.h
index 0c2653300c..39be87ee9f 100644
--- a/libavcodec/loongarch/h264_intrapred_lasx.h
+++ b/libavcodec/loongarch/h264_intrapred_loongarch.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Hao Chen <chenhao at loongson.cn>
*
* This file is part of FFmpeg.
@@ -19,13 +19,17 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
-#define AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
#include "libavcodec/avcodec.h"
+void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride);
+
void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride);
-#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
+#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264chroma.S b/libavcodec/loongarch/h264chroma.S
new file mode 100644
index 0000000000..353b8d004b
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma.S
@@ -0,0 +1,966 @@
+/*
+ * Loongson LSX/LASX optimized h264chroma
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lsx // LSX "put", 8 output bytes per row; as used below: a0=dst, a1=src, a2=stride, a3=h, a4=x, a5=y
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A = (8-x)*(8-y)
+ mul.d t4, a4, t2 // B = x*(8-y)
+ mul.d t5, t1, a5 // C = (8-x)*y
+ mul.d t6, a4, a5 // D = x*y
+ add.d t0, t4, t5 // E = B+C
+ vreplgr2vr.b vr0, t3 // broadcast each weight to every byte lane
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+ slli.d t2, a2, 1 // t2 = 2*stride
+ add.d t3, t2, a2 // t3 = 3*stride
+ slli.d t4, a2, 2 // t4 = 4*stride
+
+ bge zero, t6, .ENDLOOP_D // D <= 0: skip the four-tap path
+ move t1, a3 // t1 = rows left
+ vilvl.b vr9, vr1, vr0 // weight pairs {A,B}: A in even bytes, B in odd bytes
+ vilvl.b vr10, vr3, vr2 // weight pairs {C,D}
+.LOOP_D: // four-tap path, four output rows per pass
+ vld vr5, a1, 0 // upper row, p[i]; NOTE(review): vld fetches 16 bytes, only the low 8 are consumed - confirm the over-read is acceptable
+ vld vr6, a1, 1 // upper row, p[i+1]
+ add.d a1, a1, a2
+ vld vr7, a1, 0 // lower row, q[i]
+ vld vr8, a1, 1 // lower row, q[i+1]
+ vilvl.b vr11, vr6, vr5 // byte pairs {p[i], p[i+1]}
+ vilvl.b vr12, vr8, vr7 // byte pairs {q[i], q[i+1]}
+ vmulwev.h.bu vr13, vr9, vr11 // A*p[i] (16-bit lanes)
+ vmaddwod.h.bu vr13, vr9, vr11 // + B*p[i+1]
+ vmulwev.h.bu vr14, vr10, vr12 // C*q[i]
+ vmaddwod.h.bu vr14, vr10, vr12 // + D*q[i+1]
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6 // rounding shift right by 6, narrow to bytes
+ vstelm.d vr13, a0, 0, 0 // store the low 8 bytes to dst
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0 // next row; vr7/vr8 (previous lower row) now act as the upper row
+ vld vr6, a1, 1
+ vilvl.b vr11, vr8, vr7
+ vilvl.b vr12, vr6, vr5
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr7, a1, 0 // next row; vr5/vr6 now act as the upper row
+ vld vr8, a1, 1
+ vilvl.b vr11, vr6, vr5
+ vilvl.b vr12, vr8, vr7
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0 // fifth source row; a1 is left here so the next pass reloads it as its first row
+ vld vr6, a1, 1
+ vilvl.b vr11, vr8, vr7
+ vilvl.b vr12, vr6, vr5
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_D // repeat while rows remain (> 0)
+ b .ENDLOOP
+.ENDLOOP_D:
+
+ bge zero, t0, .ENDLOOP_E // E <= 0: skip the two-tap path
+ move t1, a3 // t1 = rows left
+ li.d t7, 1
+ slt t8, zero, t5 // t8 = (C > 0)
+ maskeqz t5, a2, t8 // t5 = t8 ? stride : 0
+ masknez t7, t7, t8 // t7 = t8 ? 0 : 1
+ or t7, t7, t5 // t7 = second-tap offset: stride when C > 0, else 1
+ vilvl.b vr7, vr4, vr0 // weight pairs {A,E}
+.LOOP_E: // two-tap path, four output rows per pass
+ vld vr5, a1, 0 // first tap
+ vldx vr6, a1, t7 // second tap (src + t7)
+ vilvl.b vr5, vr6, vr5 // byte pairs {tap0, tap1}
+ vmulwev.h.bu vr6, vr7, vr5 // A*tap0
+ vmaddwod.h.bu vr6, vr7, vr5 // + E*tap1
+ vsrarni.b.h vr6, vr6, 6 // rounding shift right by 6, narrow to bytes
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_E
+ b .ENDLOOP
+.ENDLOOP_E:
+
+ move t1, a3 // t1 = rows left
+.LOOP: // single-tap path (weight A only), four output rows per pass
+ vld vr5, a1, 0
+ vmulwev.h.bu vr6, vr0, vr5 // A * even bytes
+ vmulwod.h.bu vr7, vr0, vr5 // A * odd bytes
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6 // re-interleave even/odd results into pixel order
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, a2 // row at src + stride
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t2 // row at src + 2*stride
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t3 // row at src + 3*stride
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, t4 // src += 4*stride
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP
+.ENDLOOP:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lsx // LSX "avg", 8 bytes per row: filtered value is rounding-averaged with the bytes already in dst; a0=dst, a1=src, a2=stride, a3=h, a4=x, a5=y
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A = (8-x)*(8-y)
+ mul.d t4, a4, t2 // B = x*(8-y)
+ mul.d t5, t1, a5 // C = (8-x)*y
+ mul.d t6, a4, a5 // D = x*y
+ add.d t0, t4, t5 // E = B+C
+ vreplgr2vr.b vr0, t3 // broadcast each weight to every byte lane
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+ slli.d t2, a2, 1 // t2 = 2*stride
+ add.d t3, t2, a2 // t3 = 3*stride
+ slli.d t4, a2, 2 // t4 = 4*stride
+
+ bge zero, t6, .ENDLOOPD // D <= 0: skip the four-tap path
+ move t1, a3 // t1 = rows left
+ vilvl.b vr9, vr1, vr0 // weight pairs {A,B}
+ vilvl.b vr10, vr3, vr2 // weight pairs {C,D}
+.LOOPD: // four-tap path, four output rows per pass
+ vld vr5, a1, 0 // upper row, p[i]; NOTE(review): vld fetches 16 bytes, only the low 8 are consumed - confirm the over-read is acceptable
+ vld vr6, a1, 1 // upper row, p[i+1]
+ add.d a1, a1, a2
+ vld vr7, a1, 0 // lower row, q[i]
+ vld vr8, a1, 1 // lower row, q[i+1]
+ vld vr11, a0, 0 // current dst bytes
+ vilvl.b vr12, vr6, vr5 // byte pairs {p[i], p[i+1]}
+ vilvl.b vr13, vr8, vr7 // byte pairs {q[i], q[i+1]}
+ vmulwev.h.bu vr14, vr9, vr12 // A*p[i]
+ vmaddwod.h.bu vr14, vr9, vr12 // + B*p[i+1]
+ vmulwev.h.bu vr15, vr10, vr13 // C*q[i]
+ vmaddwod.h.bu vr15, vr10, vr13 // + D*q[i+1]
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6 // rounding shift right by 6, result kept in 16-bit lanes
+ vsllwil.hu.bu vr11, vr11, 0 // widen the low 8 dst bytes to 16-bit lanes
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1 // rounding average with dst, narrow to bytes
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0 // next row; vr7/vr8 now act as the upper row
+ vld vr6, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr8, vr7
+ vilvl.b vr13, vr6, vr5
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr7, a1, 0 // next row; vr5/vr6 now act as the upper row
+ vld vr8, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr6, vr5
+ vilvl.b vr13, vr8, vr7
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0 // fifth source row; a1 is left here so the next pass reloads it as its first row
+ vld vr6, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr8, vr7
+ vilvl.b vr13, vr6, vr5
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPD // repeat while rows remain (> 0)
+ b .ENDLOOPELSE
+.ENDLOOPD:
+
+ bge zero, t0, .ENDLOOPE // E <= 0: skip the two-tap path
+ move t1, a3 // t1 = rows left
+ li.d t7, 1
+ slt t8, zero, t5 // t8 = (C > 0)
+ maskeqz t5, a2, t8 // t5 = t8 ? stride : 0
+ masknez t7, t7, t8 // t7 = t8 ? 0 : 1
+ or t7, t7, t5 // t7 = second-tap offset: stride when C > 0, else 1
+ vilvl.b vr7, vr4, vr0 // weight pairs {A,E}
+.LOOPE: // two-tap path, four output rows per pass
+ vld vr5, a1, 0 // first tap
+ vldx vr6, a1, t7 // second tap (src + t7)
+ vld vr8, a0, 0 // current dst bytes
+ vilvl.b vr5, vr6, vr5 // byte pairs {tap0, tap1}
+ vmulwev.h.bu vr6, vr7, vr5 // A*tap0
+ vmaddwod.h.bu vr6, vr7, vr5 // + E*tap1
+ vsrari.h vr6, vr6, 6 // rounding shift right by 6, result kept in 16-bit lanes
+ vsllwil.hu.bu vr8, vr8, 0 // widen dst bytes to 16-bit lanes
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1 // rounding average with dst, narrow to bytes
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPE
+ b .ENDLOOPELSE
+.ENDLOOPE:
+
+ move t1, a3 // t1 = rows left
+.LOOPELSE: // single-tap path (weight A only), four output rows per pass
+ vld vr5, a1, 0
+ vld vr8, a0, 0 // current dst bytes
+ vmulwev.h.bu vr6, vr0, vr5 // A * even bytes
+ vmulwod.h.bu vr7, vr0, vr5 // A * odd bytes
+ vilvl.h vr6, vr7, vr6 // interleave back into pixel order for the first 8 pixels
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1 // rounding average with dst, narrow to bytes
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, a2 // row at src + stride
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t2 // row at src + 2*stride
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t3 // row at src + 3*stride
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, t4 // src += 4*stride
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPELSE
+.ENDLOOPELSE:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lsx // LSX "put", 4 output bytes per row, two rows per loop pass; a0=dst, a1=src, a2=stride, a3=h, a4=x, a5=y
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A = (8-x)*(8-y)
+ mul.d t4, a4, t2 // B = x*(8-y)
+ mul.d t5, t1, a5 // C = (8-x)*y
+ mul.d t6, a4, a5 // D = x*y
+ add.d t0, t4, t5 // E = B+C
+ slli.d t8, a2, 1 // t8 = 2*stride (used by .PUT; the two-tap path overwrites t8 but never reaches .PUT)
+ vreplgr2vr.b vr0, t3 // broadcast each weight to every byte lane
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+
+ bge zero, t6, .ENDPUT_D // D <= 0: skip the four-tap path
+ move t1, a3 // t1 = rows left
+ vilvl.b vr9, vr1, vr0 // weight pairs {A,B}
+ vilvl.b vr10, vr3, vr2 // weight pairs {C,D}
+.PUT_D: // four-tap path
+ vld vr5, a1, 0 // row r; NOTE(review): vld fetches 16 bytes, only the low bytes are consumed - confirm the over-read is acceptable
+ vld vr6, a1, 1 // row r, shifted by one byte
+ add.d a1, a1, a2
+ vld vr7, a1, 0 // row r+1
+ vld vr8, a1, 1
+ add.d a1, a1, a2 // a1 now at row r+2, which is also the next pass's first row
+ vld vr11, a1, 0 // row r+2
+ vld vr12, a1, 1
+ vilvl.b vr5, vr6, vr5 // byte pairs of row r
+ vilvl.b vr7, vr8, vr7 // byte pairs of row r+1
+ vilvl.b vr13, vr12, vr11 // byte pairs of row r+2
+ vilvl.d vr5, vr7, vr5 // upper taps: row r (low 64 bits) | row r+1 (high 64 bits)
+ vilvl.d vr13, vr13, vr7 // lower taps: row r+1 (low) | row r+2 (high)
+ vmulwev.h.bu vr14, vr9, vr5 // A*p[i]
+ vmaddwod.h.bu vr14, vr9, vr5 // + B*p[i+1]
+ vmulwev.h.bu vr15, vr10, vr13 // C*q[i]
+ vmaddwod.h.bu vr15, vr10, vr13 // + D*q[i+1]
+ vadd.h vr14, vr14, vr15
+ vsrarni.b.h vr14, vr14, 6 // rounding shift right by 6, narrow to bytes
+ vstelm.w vr14, a0, 0, 0 // word 0: first output row
+ add.d a0, a0, a2
+ vstelm.w vr14, a0, 0, 1 // word 1: second output row
+ add.d a0, a0, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_D // repeat while rows remain (> 0)
+ b .ENDPUT
+.ENDPUT_D:
+
+ bge zero, t0, .ENDPUT_E // E <= 0: skip the two-tap path
+ move t1, a3 // t1 = rows left
+ li.d t7, 1
+ slt t8, zero, t5 // t8 = (C > 0)
+ maskeqz t5, a2, t8 // t5 = t8 ? stride : 0
+ masknez t7, t7, t8 // t7 = t8 ? 0 : 1
+ or t7, t7, t5 // t7 = second-tap offset: stride when C > 0, else 1
+ vilvl.b vr7, vr4, vr0 // weight pairs {A,E}
+.PUT_E: // two-tap path
+ vld vr5, a1, 0 // first tap, row r
+ vldx vr6, a1, t7 // second tap, row r
+ vilvl.b vr5, vr6, vr5 // byte pairs {tap0, tap1}
+ add.d a1, a1, a2
+ vld vr8, a1, 0 // first tap, row r+1
+ vldx vr9, a1, t7 // second tap, row r+1
+ vilvl.b vr8, vr9, vr8
+ vilvl.d vr5, vr8, vr5 // row r pairs (low 64 bits) | row r+1 pairs (high 64 bits)
+ vmulwev.h.bu vr6, vr7, vr5 // A*tap0
+ vmaddwod.h.bu vr6, vr7, vr5 // + E*tap1
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_E
+ b .ENDPUT
+.ENDPUT_E:
+
+ move t1, a3 // t1 = rows left
+.PUT: // single-tap path (weight A only)
+ vld vr5, a1, 0 // row r
+ vldx vr8, a1, a2 // row r+1
+ vilvl.w vr5, vr8, vr5 // word 0 = row r pixels, word 1 = row r+1 pixels
+ vmulwev.h.bu vr6, vr0, vr5 // A * even bytes
+ vmulwod.h.bu vr7, vr0, vr5 // A * odd bytes
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6 // re-interleave even/odd results into pixel order
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, t8 // src += 2*stride
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT
+.ENDPUT:
+endfunc
+
+/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lasx // LASX "put", 8 bytes per row, two rows packed per 256-bit register; a0=dst, a1=src, a2=stride, a3=h, a4=x, a5=y
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A = (8-x)*(8-y)
+ mul.d t4, a4, t2 // B = x*(8-y)
+ mul.d t5, t1, a5 // C = (8-x)*y
+ mul.d t6, a4, a5 // D = x*y
+ add.d t0, t4, t5 // E = B+C
+ xvreplgr2vr.b xr0, t3 // broadcast each weight to every byte lane
+ xvreplgr2vr.b xr1, t4
+ xvreplgr2vr.b xr2, t5
+ xvreplgr2vr.b xr3, t6
+ xvreplgr2vr.b xr4, t0
+ slli.d t2, a2, 1 // t2 = 2*stride
+ add.d t3, t2, a2 // t3 = 3*stride
+ slli.d t4, a2, 2 // t4 = 4*stride
+
+ bge zero, t6, .ENDLOOP_DA // D <= 0: skip the four-tap path
+ move t1, a3 // t1 = rows left
+ xvilvl.b xr9, xr1, xr0 // weight pairs {A,B}
+ xvilvl.b xr10, xr3, xr2 // weight pairs {C,D}
+.LOOP_DA: // four-tap path, four output rows per pass
+ fld.d f5, a1, 0 // 8-byte load into the low half of vr5 (f/vr/xr registers alias); row r
+ fld.d f6, a1, 1 // row r, shifted by one byte
+ add.d a1, a1, a2
+ fld.d f7, a1, 0 // row r+1
+ fld.d f8, a1, 1
+ add.d a1, a1, a2
+ fld.d f13, a1, 0 // row r+2
+ fld.d f14, a1, 1
+ add.d a1, a1, a2
+ fld.d f15, a1, 0 // row r+3
+ fld.d f16, a1, 1
+ add.d a1, a1, a2 // a1 now at row r+4, which is also the next pass's first row
+ fld.d f17, a1, 0 // row r+4
+ fld.d f18, a1, 1
+ vilvl.b vr11, vr6, vr5 // byte pairs {p[i], p[i+1]} of row r
+ vilvl.b vr12, vr8, vr7 // pairs of row r+1
+ vilvl.b vr14, vr14, vr13 // pairs of row r+2
+ vilvl.b vr15, vr16, vr15 // pairs of row r+3
+ vilvl.b vr16, vr18, vr17 // pairs of row r+4
+ xvpermi.q xr11, xr12, 0x02 // xr11 = row r (low 128 bits) | row r+1 (high 128 bits)
+ xvpermi.q xr12, xr14, 0x02 // xr12 = row r+1 | row r+2
+ xvpermi.q xr14, xr15, 0x02 // xr14 = row r+2 | row r+3
+ xvpermi.q xr15, xr16, 0x02 // xr15 = row r+3 | row r+4
+
+ xvmulwev.h.bu xr19, xr9, xr11 // A*p[i]
+ xvmaddwod.h.bu xr19, xr9, xr11 // + B*p[i+1]
+ xvmulwev.h.bu xr20, xr10, xr12 // C*q[i]
+ xvmaddwod.h.bu xr20, xr10, xr12 // + D*q[i+1]
+ xvadd.h xr21, xr19, xr20
+ xvsrarni.b.h xr21, xr21, 6 // rounding shift right by 6, narrow to bytes within each 128-bit half
+ vstelm.d vr21, a0, 0, 0 // doubleword 0: first output row
+ add.d a0, a0, a2
+ xvstelm.d xr21, a0, 0, 2 // doubleword 2 (low part of the high half): second output row
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr13, xr9, xr14 // same filter for output rows three and four
+ xvmaddwod.h.bu xr13, xr9, xr14
+ xvmulwev.h.bu xr14, xr10, xr15
+ xvmaddwod.h.bu xr14, xr10, xr15
+ xvadd.h xr13, xr13, xr14
+ xvsrarni.b.h xr13, xr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr13, a0, 0, 2
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_DA // repeat while rows remain (> 0)
+ b .ENDLOOPA
+.ENDLOOP_DA:
+
+ bge zero, t0, .ENDLOOP_EA // E <= 0: skip the two-tap path
+ move t1, a3 // t1 = rows left
+ li.d t7, 1
+ slt t8, zero, t5 // t8 = (C > 0)
+ maskeqz t5, a2, t8 // t5 = t8 ? stride : 0
+ masknez t7, t7, t8 // t7 = t8 ? 0 : 1
+ or t7, t7, t5 // t7 = second-tap offset: stride when C > 0, else 1
+ xvilvl.b xr7, xr4, xr0 // weight pairs {A,E}
+.LOOP_EA: // two-tap path, four output rows per pass
+ fld.d f5, a1, 0 // first tap, row r
+ fldx.d f6, a1, t7 // second tap, row r
+ add.d a1, a1, a2
+ fld.d f9, a1, 0 // row r+1
+ fldx.d f10, a1, t7
+ add.d a1, a1, a2
+ fld.d f11, a1, 0 // row r+2
+ fldx.d f12, a1, t7
+ add.d a1, a1, a2
+ fld.d f13, a1, 0 // row r+3
+ fldx.d f14, a1, t7
+ vilvl.b vr5, vr6, vr5 // byte pairs {tap0, tap1} per row
+ vilvl.b vr9, vr10, vr9
+ vilvl.b vr11, vr12, vr11
+ vilvl.b vr13, vr14, vr13
+ xvpermi.q xr5, xr9, 0x02 // xr5 = row r | row r+1
+ xvpermi.q xr11, xr13, 0x02 // xr11 = row r+2 | row r+3
+
+ xvmulwev.h.bu xr8, xr7, xr5 // A*tap0
+ xvmaddwod.h.bu xr8, xr7, xr5 // + E*tap1
+ xvmulwev.h.bu xr6, xr7, xr11
+ xvmaddwod.h.bu xr6, xr7, xr11
+ xvsrarni.b.h xr8, xr8, 6 // rounding shift right by 6, narrow to bytes
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr8, a0, 0, 2
+ add.d a0, a0, a2
+ xvsrarni.b.h xr6, xr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr6, a0, 0, 2
+ add.d a0, a0, a2
+ add.d a1, a1, a2 // fourth advance: a1 now at row r+4
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_EA
+ b .ENDLOOPA
+.ENDLOOP_EA:
+
+ move t1, a3 // t1 = rows left
+.LOOPA: // single-tap path (weight A only), four output rows per pass
+ fld.d f5, a1, 0 // row r
+ fldx.d f6, a1, a2 // row r+1
+ fldx.d f7, a1, t2 // row r+2
+ fldx.d f8, a1, t3 // row r+3
+ vilvl.d vr5, vr6, vr5 // row r | row r+1 in one 128-bit register
+ vilvl.d vr7, vr8, vr7 // row r+2 | row r+3
+ xvpermi.q xr5, xr7, 0x02 // all four rows in xr5
+ xvmulwev.h.bu xr6, xr0, xr5 // A * even bytes
+ xvmulwod.h.bu xr7, xr0, xr5 // A * odd bytes
+ xvilvl.h xr8, xr7, xr6 // pixel-ordered products of rows r and r+2
+ xvilvh.h xr9, xr7, xr6 // pixel-ordered products of rows r+1 and r+3
+ xvsrarni.b.h xr9, xr8, 6 // rounding shift right by 6, narrow; doublewords 0..3 = rows r..r+3
+ vstelm.d vr9, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr9, a0, 0, 1
+ add.d a0, a0, a2
+ xvstelm.d xr9, a0, 0, 2
+ add.d a0, a0, a2
+ xvstelm.d xr9, a0, 0, 3
+ add.d a0, a0, a2
+ add.d a1, a1, t4 // src += 4*stride
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPA
+.ENDLOOPA:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lasx // LASX "avg", 8 bytes per row: filtered value is rounding-averaged with the bytes already in dst; a0=dst, a1=src, a2=stride, a3=h, a4=x, a5=y
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A = (8-x)*(8-y)
+ mul.d t4, a4, t2 // B = x*(8-y)
+ mul.d t5, t1, a5 // C = (8-x)*y
+ mul.d t6, a4, a5 // D = x*y
+ add.d t0, t4, t5 // E = B+C
+ xvreplgr2vr.b xr0, t3 // broadcast each weight to every byte lane
+ xvreplgr2vr.b xr1, t4
+ xvreplgr2vr.b xr2, t5
+ xvreplgr2vr.b xr3, t6
+ xvreplgr2vr.b xr4, t0
+ slli.d t2, a2, 1 // t2 = 2*stride
+ add.d t3, t2, a2 // t3 = 3*stride
+ slli.d t4, a2, 2 // t4 = 4*stride
+
+ bge zero, t6, .ENDLOOPDA // D <= 0: skip the four-tap path
+ move t1, a3 // t1 = rows left
+ xvilvl.b xr9, xr1, xr0 // weight pairs {A,B}
+ xvilvl.b xr10, xr3, xr2 // weight pairs {C,D}
+.LOOPDA: // four-tap path, four output rows per pass
+ fld.d f5, a1, 0 // 8-byte load into the low half of vr5 (f/vr/xr registers alias); row r
+ fld.d f6, a1, 1 // row r, shifted by one byte
+ add.d a1, a1, a2
+ fld.d f7, a1, 0 // row r+1
+ fld.d f8, a1, 1
+ add.d a1, a1, a2
+ fld.d f11, a1, 0 // row r+2
+ fld.d f12, a1, 1
+ add.d a1, a1, a2
+ fld.d f13, a1, 0 // row r+3
+ fld.d f14, a1, 1
+ add.d a1, a1, a2 // a1 now at row r+4, which is also the next pass's first row
+ fld.d f15, a1, 0 // row r+4
+ fld.d f16, a1, 1
+ fld.d f17, a0, 0 // current dst rows 0..3
+ fldx.d f18, a0, a2
+ fldx.d f19, a0, t2
+ fldx.d f20, a0, t3
+ vilvl.b vr5, vr6, vr5 // byte pairs {p[i], p[i+1]} of row r
+ vilvl.b vr7, vr8, vr7 // pairs of row r+1
+ vilvl.b vr11, vr12, vr11 // pairs of row r+2
+ vilvl.b vr13, vr14, vr13 // pairs of row r+3
+ vilvl.b vr16, vr16, vr15 // pairs of row r+4
+ xvpermi.q xr5, xr7, 0x02 // xr5 = row r (low 128 bits) | row r+1 (high 128 bits)
+ xvpermi.q xr7, xr11, 0x02 // xr7 = row r+1 | row r+2
+ xvpermi.q xr11, xr13, 0x02 // xr11 = row r+2 | row r+3
+ xvpermi.q xr13, xr16, 0x02 // xr13 = row r+3 | row r+4
+ xvpermi.q xr17, xr18, 0x02 // xr17 = dst row 0 | dst row 1
+ xvpermi.q xr19, xr20, 0x02 // xr19 = dst row 2 | dst row 3
+
+ xvmulwev.h.bu xr14, xr9, xr5 // A*p[i]
+ xvmaddwod.h.bu xr14, xr9, xr5 // + B*p[i+1]
+ xvmulwev.h.bu xr15, xr10, xr7 // C*q[i]
+ xvmaddwod.h.bu xr15, xr10, xr7 // + D*q[i+1]
+ xvadd.h xr14, xr14, xr15
+ xvsrari.h xr14, xr14, 6 // rounding shift right by 6, result kept in 16-bit lanes
+ xvsllwil.hu.bu xr17, xr17, 0 // widen dst bytes to 16-bit lanes
+ xvadd.h xr20, xr14, xr17
+ xvsrarni.b.h xr20, xr20, 1 // rounding average with dst, narrow to bytes
+ xvstelm.d xr20, a0, 0, 0 // doubleword 0: first output row
+ add.d a0, a0, a2
+ xvstelm.d xr20, a0, 0, 2 // doubleword 2: second output row
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr14, xr9, xr11 // same filter for output rows three and four
+ xvmaddwod.h.bu xr14, xr9, xr11
+ xvmulwev.h.bu xr15, xr10, xr13
+ xvmaddwod.h.bu xr15, xr10, xr13
+ xvadd.h xr14, xr14, xr15
+ xvsrari.h xr14, xr14, 6
+ xvsllwil.hu.bu xr19, xr19, 0
+ xvadd.h xr21, xr14, xr19
+ xvsrarni.b.h xr21, xr21, 1
+ xvstelm.d xr21, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr21, a0, 0, 2
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPDA // repeat while rows remain (> 0)
+ b .ENDLOOPELSEA
+.ENDLOOPDA:
+
+ bge zero, t0, .ENDLOOPEA // E <= 0: skip the two-tap path
+ move t1, a3 // t1 = rows left
+ li.d t7, 1
+ slt t8, zero, t5 // t8 = (C > 0)
+ maskeqz t5, a2, t8 // t5 = t8 ? stride : 0
+ masknez t7, t7, t8 // t7 = t8 ? 0 : 1
+ or t7, t7, t5 // t7 = second-tap offset: stride when C > 0, else 1
+ xvilvl.b xr7, xr4, xr0 // weight pairs {A,E}
+.LOOPEA: // two-tap path, four output rows per pass
+ fld.d f5, a1, 0 // first tap, row r
+ fldx.d f6, a1, t7 // second tap, row r
+ add.d a1, a1, a2
+ fld.d f8, a1, 0 // row r+1
+ fldx.d f9, a1, t7
+ add.d a1, a1, a2
+ fld.d f10, a1, 0 // row r+2
+ fldx.d f11, a1, t7
+ add.d a1, a1, a2
+ fld.d f12, a1, 0 // row r+3
+ fldx.d f13, a1, t7
+ add.d a1, a1, a2 // a1 now at row r+4
+ fld.d f14, a0, 0 // current dst rows 0..3
+ fldx.d f15, a0, a2
+ fldx.d f16, a0, t2
+ fldx.d f17, a0, t3
+ vilvl.b vr5, vr6, vr5 // byte pairs {tap0, tap1} per row
+ vilvl.b vr8, vr9, vr8
+ vilvl.b vr10, vr11, vr10
+ vilvl.b vr12, vr13, vr12
+ xvpermi.q xr5, xr8, 0x02 // xr5 = row r | row r+1
+ xvpermi.q xr10, xr12, 0x02 // xr10 = row r+2 | row r+3
+ xvpermi.q xr14, xr15, 0x02 // xr14 = dst row 0 | dst row 1
+ xvpermi.q xr16, xr17, 0x02 // xr16 = dst row 2 | dst row 3
+
+ xvmulwev.h.bu xr6, xr7, xr5 // A*tap0
+ xvmaddwod.h.bu xr6, xr7, xr5 // + E*tap1
+ xvsrari.h xr6, xr6, 6 // rounding shift right by 6, result kept in 16-bit lanes
+ xvsllwil.hu.bu xr14, xr14, 0 // widen dst bytes to 16-bit lanes
+ xvadd.h xr8, xr6, xr14
+ xvsrarni.b.h xr8, xr8, 1 // rounding average with dst, narrow to bytes
+ xvstelm.d xr8, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr8, a0, 0, 2
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr6, xr7, xr10
+ xvmaddwod.h.bu xr6, xr7, xr10
+ xvsrari.h xr6, xr6, 6
+ xvsllwil.hu.bu xr16, xr16, 0
+ xvadd.h xr8, xr6, xr16
+ xvsrarni.b.h xr8, xr8, 1
+ xvstelm.d xr8, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr8, a0, 0, 2
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPEA
+ b .ENDLOOPELSEA
+.ENDLOOPEA:
+
+ move t1, a3 // t1 = rows left
+.LOOPELSEA: // single-tap path (weight A only), four output rows per pass
+ fld.d f5, a1, 0 // src rows r..r+3
+ fldx.d f6, a1, a2
+ fldx.d f7, a1, t2
+ fldx.d f8, a1, t3
+ fld.d f9, a0, 0 // current dst rows 0..3
+ fldx.d f10, a0, a2
+ fldx.d f11, a0, t2
+ fldx.d f12, a0, t3
+ xvpermi.q xr5, xr6, 0x02 // xr5 = src row r | row r+1
+ xvpermi.q xr7, xr8, 0x02 // xr7 = src row r+2 | row r+3
+ xvpermi.q xr9, xr10, 0x02 // xr9 = dst row 0 | dst row 1
+ xvpermi.q xr11, xr12, 0x02 // xr11 = dst row 2 | dst row 3
+
+ xvmulwev.h.bu xr12, xr0, xr5 // A * even bytes
+ xvmulwod.h.bu xr13, xr0, xr5 // A * odd bytes
+ xvilvl.h xr12, xr13, xr12 // interleave back into pixel order
+ xvsrari.h xr12, xr12, 6
+ xvsllwil.hu.bu xr9, xr9, 0
+ xvadd.h xr9, xr12, xr9
+ xvsrarni.b.h xr9, xr9, 1 // rounding average with dst, narrow to bytes
+ xvstelm.d xr9, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr9, a0, 0, 2
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr12, xr0, xr7
+ xvmulwod.h.bu xr13, xr0, xr7
+ xvilvl.h xr12, xr13, xr12
+ xvsrari.h xr12, xr12, 6
+ xvsllwil.hu.bu xr11, xr11, 0
+ xvadd.h xr13, xr12, xr11
+ xvsrarni.b.h xr13, xr13, 1
+ xvstelm.d xr13, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr13, a0, 0, 2
+ add.d a0, a0, a2
+ add.d a1, a1, t4 // src += 4*stride
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPELSEA
+.ENDLOOPELSEA:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lasx // "put", 4 output bytes per row, two rows per loop pass; a0=dst, a1=src, a2=stride, a3=h, a4=x, a5=y
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A = (8-x)*(8-y)
+ mul.d t4, a4, t2 // B = x*(8-y)
+ mul.d t5, t1, a5 // C = (8-x)*y
+ mul.d t6, a4, a5 // D = x*y
+ add.d t0, t4, t5 // E = B+C
+ slli.d t8, a2, 1 // t8 = 2*stride (used by .PUTA; the two-tap path overwrites t8 but never reaches .PUTA)
+ vreplgr2vr.b vr0, t3 // broadcast each weight to every byte lane
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+
+ bge zero, t6, .ENDPUT_DA // D <= 0: skip the four-tap path
+ move t1, a3 // t1 = rows left
+ vilvl.b vr9, vr1, vr0 // weight pairs {A,B}
+ vilvl.b vr10, vr3, vr2 // weight pairs {C,D}
+.PUT_DA: // four-tap path
+ fld.d f5, a1, 0 // 8-byte load into the low half of vr5 (f/vr registers alias); row r
+ fld.d f6, a1, 1 // row r, shifted by one byte
+ add.d a1, a1, a2
+ fld.d f7, a1, 0 // row r+1
+ fld.d f8, a1, 1
+ add.d a1, a1, a2 // a1 now at row r+2, which is also the next pass's first row
+ fld.d f11, a1, 0 // row r+2
+ fld.d f12, a1, 1
+ vilvl.b vr5, vr6, vr5 // byte pairs of row r
+ vilvl.b vr7, vr8, vr7 // byte pairs of row r+1
+ vilvl.b vr13, vr12, vr11 // byte pairs of row r+2
+ vilvl.d vr5, vr7, vr5 // upper taps: row r (low 64 bits) | row r+1 (high 64 bits)
+ vilvl.d vr13, vr13, vr7 // lower taps: row r+1 (low) | row r+2 (high)
+ vmulwev.h.bu vr14, vr9, vr5 // A*p[i]
+ vmaddwod.h.bu vr14, vr9, vr5 // + B*p[i+1]
+ vmulwev.h.bu vr15, vr10, vr13 // C*q[i]
+ vmaddwod.h.bu vr15, vr10, vr13 // + D*q[i+1]
+ xvadd.h xr14, xr14, xr15 // only 256-bit instruction in this routine; just the low 128 bits are used afterwards
+ vsrarni.b.h vr16, vr14, 6 // rounding shift right by 6, narrowed bytes land in the low 64 bits of vr16
+ vstelm.w vr16, a0, 0, 0 // word 0: first output row
+ add.d a0, a0, a2
+ vstelm.w vr16, a0, 0, 1 // word 1: second output row
+ add.d a0, a0, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_DA // repeat while rows remain (> 0)
+ b .ENDPUTA
+.ENDPUT_DA:
+
+ bge zero, t0, .ENDPUT_EA // E <= 0: skip the two-tap path
+ move t1, a3 // t1 = rows left
+ li.d t7, 1
+ slt t8, zero, t5 // t8 = (C > 0)
+ maskeqz t5, a2, t8 // t5 = t8 ? stride : 0
+ masknez t7, t7, t8 // t7 = t8 ? 0 : 1
+ or t7, t7, t5 // t7 = second-tap offset: stride when C > 0, else 1
+ vilvl.b vr7, vr4, vr0 // weight pairs {A,E}
+.PUT_EA: // two-tap path
+ fld.d f5, a1, 0 // first tap, row r
+ fldx.d f6, a1, t7 // second tap, row r
+ vilvl.b vr5, vr6, vr5 // byte pairs {tap0, tap1}
+ add.d a1, a1, a2
+ fld.d f8, a1, 0 // first tap, row r+1
+ fldx.d f9, a1, t7 // second tap, row r+1
+ vilvl.b vr8, vr9, vr8
+ vilvl.d vr5, vr8, vr5 // row r pairs (low 64 bits) | row r+1 pairs (high 64 bits)
+ vmulwev.h.bu vr6, vr7, vr5 // A*tap0
+ vmaddwod.h.bu vr6, vr7, vr5 // + E*tap1
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_EA
+ b .ENDPUTA
+.ENDPUT_EA:
+
+ move t1, a3 // t1 = rows left
+.PUTA: // single-tap path (weight A only)
+ fld.d f5, a1, 0 // row r
+ fldx.d f8, a1, a2 // row r+1
+ vilvl.w vr5, vr8, vr5 // word 0 = row r pixels, word 1 = row r+1 pixels
+ vmulwev.h.bu vr6, vr0, vr5 // A * even bytes
+ vmulwod.h.bu vr7, vr0, vr5 // A * odd bytes
+ vilvl.h vr6, vr7, vr6 // interleave back into pixel order
+ vsrarni.b.h vr6, vr6, 6 // rounding shift right by 6, narrow to bytes
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, t8 // src += 2*stride
+ addi.d t1, t1, -2
+ blt zero, t1, .PUTA
+.ENDPUTA:
+endfunc
diff --git a/libavcodec/loongarch/h264chroma_init_loongarch.c b/libavcodec/loongarch/h264chroma_init_loongarch.c
index 0ca24ecc47..40a957aad3 100644
--- a/libavcodec/loongarch/h264chroma_init_loongarch.c
+++ b/libavcodec/loongarch/h264chroma_init_loongarch.c
@@ -19,7 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "h264chroma_lasx.h"
+#include "h264chroma_loongarch.h"
#include "libavutil/attributes.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264chroma.h"
@@ -27,6 +27,14 @@
av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_lsx(cpu_flags)) {
+ if (bit_depth <= 8) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
+ }
+ }
+
if (have_lasx(cpu_flags)) {
if (bit_depth <= 8) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
diff --git a/libavcodec/loongarch/h264chroma_lasx.c b/libavcodec/loongarch/h264chroma_lasx.c
deleted file mode 100644
index 1c0e002bdf..0000000000
--- a/libavcodec/loongarch/h264chroma_lasx.c
+++ /dev/null
@@ -1,1280 +0,0 @@
-/*
- * Loongson LASX optimized h264chroma
- *
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf at loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "h264chroma_lasx.h"
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavutil/loongarch/loongson_intrinsics.h"
-
-static const uint8_t chroma_mask_arr[64] = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
- 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
-};
-
-static av_always_inline void avc_chroma_hv_8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride_2x << 1;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
- res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
- res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
- out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hv_8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
- __m256i res_vt0, res_vt1, res_vt2, res_vt3;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
- src8, src7, 0x20, src1, src3, src5, src7);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
- src7, mask, src1, src3, src5, src7);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
- coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
- res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
- res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
- res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
- DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1,
- res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1,
- res_vt0, res_vt1, res_vt2, res_vt3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i src0, src1, src2, src3, out;
- __m256i res0, res1;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2);
- src3 = __lasx_xvldx(src, stride_3x);
- DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-
-}
-
-static av_always_inline void avc_chroma_hz_8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6);
- src7 = __lasx_xvldx(src, stride_3x);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
- src7, src6, 0x20, src0, src2, src4, src6);
- DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask,
- src6, src6, mask, src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
- coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_nonmult_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1, int32_t height)
-{
- uint32_t row;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, out;
- __m256i res0, res1;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- mask = __lasx_xvld(chroma_mask_arr, 0);
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-
- for (row = height >> 2; row--;) {
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src0, src1, src2, src3);
- src += stride_4x;
- DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
- dst += stride_4x;
- }
-
- if ((height & 3)) {
- src0 = __lasx_xvld(src, 0);
- src1 = __lasx_xvldx(src, stride);
- src1 = __lasx_xvpermi_q(src1, src0, 0x20);
- src0 = __lasx_xvshuf_b(src1, src1, mask);
- res0 = __lasx_xvdp2_h_bu(src0, coeff_vec);
- out = __lasx_xvssrarni_bu_h(res0, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- dst += stride;
- __lasx_xvstelm_d(out, dst, 0, 2);
- }
-}
-
-static av_always_inline void avc_chroma_vt_8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res0, res1;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
- src8, src7, 0x20, src4, src5, src6, src7);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
- src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec,
- src6, coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void copy_width8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- uint64_t tmp[8];
- ptrdiff_t stride_2, stride_3, stride_4;
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "ld.d %[tmp0], %[src], 0x0 \n\t"
- "ldx.d %[tmp1], %[src], %[stride] \n\t"
- "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
- "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "ld.d %[tmp4], %[src], 0x0 \n\t"
- "ldx.d %[tmp5], %[src], %[stride] \n\t"
- "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
- "ldx.d %[tmp7], %[src], %[stride_3] \n\t"
-
- "st.d %[tmp0], %[dst], 0x0 \n\t"
- "stx.d %[tmp1], %[dst], %[stride] \n\t"
- "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
- "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
- "add.d %[dst], %[dst], %[stride_4] \n\t"
- "st.d %[tmp4], %[dst], 0x0 \n\t"
- "stx.d %[tmp5], %[dst], %[stride] \n\t"
- "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
- "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
- : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
- [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
- [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
- [dst]"+&r"(dst), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4)
- : [stride]"r"(stride)
- : "memory"
- );
-}
-
-static av_always_inline void copy_width8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- uint64_t tmp[4];
- ptrdiff_t stride_2, stride_3;
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "ld.d %[tmp0], %[src], 0x0 \n\t"
- "ldx.d %[tmp1], %[src], %[stride] \n\t"
- "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
- "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
-
- "st.d %[tmp0], %[dst], 0x0 \n\t"
- "stx.d %[tmp1], %[dst], %[stride] \n\t"
- "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
- "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
- : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3)
- : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src)
- : "memory"
- );
-}
-
-static void avc_chroma_hv_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- } else if (8 == height) {
- avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- }
-}
-
-static void avc_chroma_hv_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1)
-{
- ptrdiff_t stride_2 = stride << 1;
- __m256i src0, src1, src2;
- __m256i res_hz, res_vt;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
- __m256i coeff_vt_vec = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1);
- src0 = __lasx_xvpermi_q(src0, src1, 0x02);
- res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec);
- res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec);
- res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01);
- res_vt = __lasx_xvadd_h(res_hz, res_vt);
- res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6);
- __lasx_xvstelm_w(res_vt, dst, 0, 0);
- __lasx_xvstelm_w(res_vt, dst + stride, 0, 1);
-}
-
-static void avc_chroma_hv_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4;
- __m256i res_hz0, res_hz1, res_vt0, res_vt1;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
- src4, src3, mask, src0, src1, src2, src3);
- DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
- DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
- res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1);
- res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6);
- __lasx_xvstelm_w(res_hz0, dst, 0, 0);
- __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_hv_4x8_lasx(const uint8_t *src, uint8_t * dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i res_hz0, res_hz1, res_hz2, res_hz3;
- __m256i res_vt0, res_vt1, res_vt2, res_vt3;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- src += stride_4;
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
- src4, src3, mask, src0, src1, src2, src3);
- DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask,
- src8, src7, mask, src4, src5, src6, src7);
- DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02,
- src5, src7, 0x02, src0, src1, src4, src5);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec,
- src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
- DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
- coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
- DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2);
- res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6);
- __lasx_xvstelm_w(res_hz0, dst, 0, 0);
- __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
- dst += stride_4;
- __lasx_xvstelm_w(res_hz0, dst, 0, 2);
- __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3);
- __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6);
- __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_hv_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1,
- int32_t height)
-{
- if (8 == height) {
- avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- } else if (4 == height) {
- avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- } else if (2 == height) {
- avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- }
-}
-
-static void avc_chroma_hz_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- __m256i src0, src1;
- __m256i res, mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- src1 = __lasx_xvldx(src, stride);
- src0 = __lasx_xvshuf_b(src1, src0, mask);
- res = __lasx_xvdp2_h_bu(src0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
-}
-
-static void avc_chroma_hz_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- __m256i src0, src1, src2, src3;
- __m256i res, mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
- src3 = __lasx_xvldx(src, stride_3);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- res = __lasx_xvdp2_h_bu(src0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
- __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_hz_4x8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i res0, res1, mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- src += stride_4;
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6);
- src7 = __lasx_xvldx(src, stride_3);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
- src7, src6, mask, src0, src2, src4, src6);
- DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src4, coeff_vec, res0, res1);
- res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_w(res0, dst, 0, 0);
- __lasx_xvstelm_w(res0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
- dst += stride_4;
- __lasx_xvstelm_w(res0, dst, 0, 2);
- __lasx_xvstelm_w(res0, dst + stride, 0, 3);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_hz_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (8 == height) {
- avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1);
- } else if (4 == height) {
- avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (2 == height) {
- avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avc_chroma_hz_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, coeff1);
- } else {
- avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height);
- }
-}
-
-static void avc_chroma_vt_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- __m256i src0, src1, src2;
- __m256i tmp0, tmp1;
- __m256i res;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- src0 = __lasx_xvld(src, 0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2);
- DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1);
- tmp0 = __lasx_xvilvl_d(tmp1, tmp0);
- res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
-}
-
-static void avc_chroma_vt_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4;
- __m256i tmp0, tmp1, tmp2, tmp3;
- __m256i res;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- src0 = __lasx_xvld(src, 0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
- tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
- res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
- __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_vt_4x8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i res0, res1;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- src += stride_4;
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src5, src6, src7, src8);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
- tmp0, tmp2, tmp4, tmp6);
- tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
- tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02);
- DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1);
- res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_w(res0, dst, 0, 0);
- __lasx_xvstelm_w(res0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
- dst += stride_4;
- __lasx_xvstelm_w(res0, dst, 0, 2);
- __lasx_xvstelm_w(res0, dst + stride, 0, 3);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_vt_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (8 == height) {
- avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1);
- } else if (4 == height) {
- avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (2 == height) {
- avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avc_chroma_vt_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void copy_width4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t height)
-{
- uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
-
- if (8 == height) {
- ptrdiff_t stride_2, stride_3, stride_4;
-
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "ld.wu %[tp0], %[src], 0 \n\t"
- "ldx.wu %[tp1], %[src], %[stride] \n\t"
- "ldx.wu %[tp2], %[src], %[stride_2] \n\t"
- "ldx.wu %[tp3], %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "ld.wu %[tp4], %[src], 0 \n\t"
- "ldx.wu %[tp5], %[src], %[stride] \n\t"
- "ldx.wu %[tp6], %[src], %[stride_2] \n\t"
- "ldx.wu %[tp7], %[src], %[stride_3] \n\t"
- "st.w %[tp0], %[dst], 0 \n\t"
- "stx.w %[tp1], %[dst], %[stride] \n\t"
- "stx.w %[tp2], %[dst], %[stride_2] \n\t"
- "stx.w %[tp3], %[dst], %[stride_3] \n\t"
- "add.d %[dst], %[dst], %[stride_4] \n\t"
- "st.w %[tp4], %[dst], 0 \n\t"
- "stx.w %[tp5], %[dst], %[stride] \n\t"
- "stx.w %[tp6], %[dst], %[stride_2] \n\t"
- "stx.w %[tp7], %[dst], %[stride_3] \n\t"
- : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4),
- [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
- [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5),
- [tp6]"+&r"(tp6), [tp7]"+&r"(tp7)
- : [stride]"r"(stride)
- : "memory"
- );
- } else if (4 == height) {
- ptrdiff_t stride_2, stride_3;
-
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "ld.wu %[tp0], %[src], 0 \n\t"
- "ldx.wu %[tp1], %[src], %[stride] \n\t"
- "ldx.wu %[tp2], %[src], %[stride_2] \n\t"
- "ldx.wu %[tp3], %[src], %[stride_3] \n\t"
- "st.w %[tp0], %[dst], 0 \n\t"
- "stx.w %[tp1], %[dst], %[stride] \n\t"
- "stx.w %[tp2], %[dst], %[stride_2] \n\t"
- "stx.w %[tp3], %[dst], %[stride_3] \n\t"
- : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3),
- [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
- [tp2]"+&r"(tp2), [tp3]"+&r"(tp3)
- : [stride]"r"(stride)
- : "memory"
- );
- } else if (2 == height) {
- __asm__ volatile (
- "ld.wu %[tp0], %[src], 0 \n\t"
- "ldx.wu %[tp1], %[src], %[stride] \n\t"
- "st.w %[tp0], %[dst], 0 \n\t"
- "stx.w %[tp1], %[dst], %[stride] \n\t"
- : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1)
- : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride)
- : "memory"
- );
- }
-}
-
-static void copy_width8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t height)
-{
- if (8 == height) {
- copy_width8x8_lasx(src, dst, stride);
- } else if (4 == height) {
- copy_width8x4_lasx(src, dst, stride);
- }
-}
-
-void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int height, int x, int y)
-{
- av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
- if(x && y) {
- avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
- } else if (x) {
- avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height);
- } else if (y) {
- avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height);
- } else {
- copy_width4_lasx(src, dst, stride, height);
- }
-}
-
-void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int height, int x, int y)
-{
- av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
- if (!(x || y)) {
- copy_width8_lasx(src, dst, stride, height);
- } else if (x && y) {
- avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
- } else if (x) {
- avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height);
- } else {
- avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height);
- }
-}
-
-static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
- res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
- res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
- out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out = __lasx_xvavgr_bu(out, tp0);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3, dst0, dst1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
- __m256i res_vt0, res_vt1, res_vt2, res_vt3;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
- src8, src7, 0x20, src1, src3, src5, src7);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
- src7, mask, src1, src3, src5, src7);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
- coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
- res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
- res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
- res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
- res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
- res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
- res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1);
- res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6,
- out0, out1);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- dst -= stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out0 = __lasx_xvavgr_bu(out0, dst0);
- out1 = __lasx_xvavgr_bu(out1, dst1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i tp0, tp1, tp2, tp3;
- __m256i src0, src1, src2, src3, out;
- __m256i res0, res1;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- mask = __lasx_xvld(chroma_mask_arr, 0);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src0, src1, src2, src3);
- DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out = __lasx_xvavgr_bu(out, tp0);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3, dst0, dst1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- mask = __lasx_xvld(chroma_mask_arr, 0);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src0, src1, src2, src3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src4, src5, src6, src7);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
- src7, src6, 0x20, src0, src2, src4, src6);
- DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4,
- mask, src6, src6, mask, src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
- coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- dst -= stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out0 = __lasx_xvavgr_bu(out0, dst0);
- out1 = __lasx_xvavgr_bu(out1, dst1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res0, res1;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out = __lasx_xvavgr_bu(out, tp0);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3, dst0, dst1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
- src8, src7, 0x20, src4, src5, src6, src7);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
- src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
- coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- dst -= stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out0 = __lasx_xvavgr_bu(out0, dst0);
- out1 = __lasx_xvavgr_bu(out1, dst1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avg_width8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- __m256i src0, src1, src2, src3;
- __m256i dst0, dst1, dst2, dst3;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
-
- src0 = __lasx_xvldrepl_d(src, 0);
- src1 = __lasx_xvldrepl_d(src + stride, 0);
- src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
- src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
- dst0 = __lasx_xvldrepl_d(dst, 0);
- dst1 = __lasx_xvldrepl_d(dst + stride, 0);
- dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
- dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
- src0 = __lasx_xvpackev_d(src1,src0);
- src2 = __lasx_xvpackev_d(src3,src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- dst0 = __lasx_xvpackev_d(dst1,dst0);
- dst2 = __lasx_xvpackev_d(dst3,dst2);
- dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
- dst0 = __lasx_xvavgr_bu(src0, dst0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-
- src += stride_4x;
- dst += stride_4x;
- src0 = __lasx_xvldrepl_d(src, 0);
- src1 = __lasx_xvldrepl_d(src + stride, 0);
- src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
- src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
- dst0 = __lasx_xvldrepl_d(dst, 0);
- dst1 = __lasx_xvldrepl_d(dst + stride, 0);
- dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
- dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
- src0 = __lasx_xvpackev_d(src1,src0);
- src2 = __lasx_xvpackev_d(src3,src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- dst0 = __lasx_xvpackev_d(dst1,dst0);
- dst2 = __lasx_xvpackev_d(dst3,dst2);
- dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
- dst0 = __lasx_xvavgr_bu(src0, dst0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avg_width8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- __m256i src0, src1, src2, src3;
- __m256i dst0, dst1, dst2, dst3;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- src0 = __lasx_xvldrepl_d(src, 0);
- src1 = __lasx_xvldrepl_d(src + stride, 0);
- src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
- src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
- dst0 = __lasx_xvldrepl_d(dst, 0);
- dst1 = __lasx_xvldrepl_d(dst + stride, 0);
- dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
- dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
- src0 = __lasx_xvpackev_d(src1,src0);
- src2 = __lasx_xvpackev_d(src3,src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- dst0 = __lasx_xvpackev_d(dst1,dst0);
- dst2 = __lasx_xvpackev_d(dst3,dst2);
- dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
- dst0 = __lasx_xvavgr_bu(src0, dst0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static void avc_chroma_hv_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride,
- uint32_t coef_hor0,
- uint32_t coef_hor1,
- uint32_t coef_ver0,
- uint32_t coef_ver1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_hv_and_aver_dst_8x4_lasx(src, dst, stride, coef_hor0,
- coef_hor1, coef_ver0, coef_ver1);
- } else if (8 == height) {
- avc_chroma_hv_and_aver_dst_8x8_lasx(src, dst, stride, coef_hor0,
- coef_hor1, coef_ver0, coef_ver1);
- }
-}
-
-static void avc_chroma_hz_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1, int32_t height)
-{
- if (4 == height) {
- avc_chroma_hz_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_hz_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avc_chroma_vt_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1, int32_t height)
-{
- if (4 == height) {
- avc_chroma_vt_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_vt_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avg_width8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t height)
-{
- if (8 == height) {
- avg_width8x8_lasx(src, dst, stride);
- } else if (4 == height) {
- avg_width8x4_lasx(src, dst, stride);
- }
-}
-
-void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int height, int x, int y)
-{
- av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
- if (!(x || y)) {
- avg_width8_lasx(src, dst, stride, height);
- } else if (x && y) {
- avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y,
- (8 - y), height);
- } else if (x) {
- avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), height);
- } else {
- avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y), height);
- }
-}
diff --git a/libavcodec/loongarch/h264chroma_lasx.h b/libavcodec/loongarch/h264chroma_lasx.h
deleted file mode 100644
index 633752035e..0000000000
--- a/libavcodec/loongarch/h264chroma_lasx.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf at loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
-#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int h, int x, int y);
-void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int h, int x, int y);
-void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int h, int x, int y);
-
-#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */
diff --git a/libavcodec/loongarch/h264chroma_loongarch.h b/libavcodec/loongarch/h264chroma_loongarch.h
new file mode 100644
index 0000000000..e65fcfe9f3
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma_loongarch.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "libavcodec/h264.h"
+
+/* H.264 chroma motion compensation, 4 or 8 pixels wide, h rows high.
+ * Parameter types match the prototypes of the removed h264chroma_lasx.h;
+ * the removed LASX C code asserted 0 <= x < 8 and 0 <= y < 8. */
+void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int h, int x, int y);
+void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int h, int x, int y);
+
+#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */
diff --git a/libavcodec/loongarch/h264intrapred.S b/libavcodec/loongarch/h264intrapred.S
new file mode 100644
index 0000000000..a03f467b6e
--- /dev/null
+++ b/libavcodec/loongarch/h264intrapred.S
@@ -0,0 +1,299 @@
+/*
+ * Loongson LSX optimized h264intrapred
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const shufa
+.byte 6, 5, 4, 3, 2, 1, 0, 0 /* vshuf.b indices reversing bytes 0..6; the final 0 fills the 8 bytes that fld.d loads */
+endconst
+
+const mulk
+.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 0, 0 /* halfword weights 2..8; the final 0 fills the 16 bytes that vld loads and cancels the unused eighth lane */
+endconst
+
+const mulh
+.short 0, 1, 2, 3, 4, 5, 6, 7 /* column indices as 16-bit little-endian values */
+.short 8, 9, 10, 11, 12, 13, 14, 15
+endconst
+
+.macro PRED16X16_PLANE // in: a0 = src, a1 = stride (per the prototypes in this file); out: t3 = weighted horizontal sum, t4 = weighted vertical sum, t1 = src + 15 * stride - 1, t2 = src - stride - 1
+ slli.d t6, a1, 1 // t6 = 2 * stride
+ slli.d t4, a1, 3 // t4 = 8 * stride (reused below as the vertical sum)
+ addi.d t0, a0, 7
+ sub.d t0, t0, a1 // t0 = src + 7 - stride
+ add.d t1, a0, t4
+ addi.d t1, t1, -1 // t1 = src + 8 * stride - 1
+ sub.d t2, t1, t6 // t2 = src + 6 * stride - 1
+
+ ld.bu t3, t0, 1
+ ld.bu t4, t0, -1
+ ld.bu t5, t1, 0
+ ld.bu t7, t2, 0
+ sub.d t3, t3, t4 // t3 = t0[1] - t0[-1]
+ sub.d t4, t5, t7 // t4 = t1[0] - t2[0]
+
+ la.local t5, mulk
+ vld vr0, t5, 0 // 16-byte load of the halfword weights; NOTE(review): two bytes more than mulk defines
+ fld.d f1, t0, 2 // bytes t0[2..9] into the low half of vr1
+ fld.d f2, t0, -8 // bytes t0[-8..-1] into the low half of vr2
+ la.local t5, shufa
+ fld.d f3, t5, 0 // 8-byte load of the shuffle indices; NOTE(review): one byte more than shufa defines
+ vshuf.b vr2, vr2, vr2, vr3 // reorder to t0[-2], t0[-3], ..., t0[-8]
+ vilvl.b vr1, vr1, vr2 // byte pairs (t0[-k], t0[k]) for k = 2..8
+ vhsubw.hu.bu vr1, vr1, vr1 // halfword = t0[k] - t0[-k]
+ vmul.h vr0, vr0, vr1 // multiply each difference by its weight
+ vhaddw.w.h vr1, vr0, vr0 // three widening pairwise adds reduce
+ vhaddw.d.w vr0, vr1, vr1 // the eight products
+ vhaddw.q.d vr1, vr0, vr0 // to a single sum
+ vpickve2gr.w t5, vr1, 0
+ add.d t3, t3, t5 // t3 = sum over k = 1..8 of k * (t0[k] - t0[-k])
+// k = 2: t2 moves up one row; t5 = 2 * (t1[stride] - t2[0])
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t5, t7, t8
+ slli.d t5, t5, 1
+
+// k = 3 and 4: t1 moves down two rows, t2 up one row per term; t5 += 3 * diff, then 4 * diff
+ add.d t1, t1, t6
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ld.bu t7, t1, 0
+ sub.d t7, t7, t8
+ slli.d t8, t7, 1
+ add.d t7, t7, t8 // 3 * diff
+ add.d t5, t5, t7
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t7, t7, t8
+ slli.d t7, t7, 2 // 4 * diff
+ add.d t5, t5, t7
+
+// k = 5 and 6: same stepping; t5 += 5 * diff, then 6 * diff
+ add.d t1, t1, t6
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ld.bu t7, t1, 0
+ sub.d t7, t7, t8
+ slli.d t8, t7, 2
+ add.d t7, t7, t8 // 5 * diff
+ add.d t5, t5, t7
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t7, t7, t8
+ slli.d t8, t7, 1
+ slli.d t7, t7, 2
+ add.d t7, t7, t8 // 6 * diff
+ add.d t5, t5, t7
+
+// k = 7 and 8: same stepping; t5 += 7 * diff, then 8 * diff
+ add.d t1, t1, t6
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ld.bu t7, t1, 0
+ sub.d t7, t7, t8
+ slli.d t8, t7, 3
+ sub.d t7, t8, t7 // 7 * diff
+ add.d t5, t5, t7
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t7, t7, t8
+ slli.d t7, t7, 3 // 8 * diff
+ add.d t5, t5, t7
+ add.d t4, t4, t5 // t4 = k = 1 term plus the k = 2..8 terms
+ add.d t1, t1, a1 // t1 = src + 15 * stride - 1
+.endm
+
+.macro PRED16X16_PLANE_END // in: t3 = per-column step, t4 = per-row step, t1/t2 = pixel pointers, a0 = dst row, a1 = stride; writes 16 rows of 16 bytes
+ ld.bu t7, t1, 0
+ ld.bu t8, t2, 16
+ add.d t5, t7, t8
+ addi.d t5, t5, 1
+ slli.d t5, t5, 4 // t5 = 16 * (t1[0] + t2[16] + 1)
+ add.d t7, t3, t4
+ slli.d t8, t7, 3
+ sub.d t7, t8, t7 // t7 = 7 * (t3 + t4)
+ sub.d t5, t5, t7 // t5 = base value of the first row
+
+ la.local t8, mulh
+ vld vr3, t8, 0 // halfwords 0..7
+ slli.d t8, t3, 3 // t8 = 8 * t3
+ vreplgr2vr.h vr4, t3
+ vreplgr2vr.h vr9, t8
+ vmul.h vr5, vr3, vr4 // vr5 = t3 * {0..7}
+
+.rept 16
+ move t7, t5 // t7 = base value of this row
+ add.d t5, t5, t4 // t5 = base value of the next row
+ vreplgr2vr.h vr6, t7
+ vadd.h vr7, vr6, vr5 // columns 0..7
+ vadd.h vr8, vr9, vr7 // columns 8..15 (adds 8 * t3)
+ vssrani.bu.h vr8, vr7, 5 // shift right by 5, saturate to unsigned bytes, pack vr7 low / vr8 high
+ vst vr8, a0, 0
+ add.d a0, a0, a1 // next output row
+.endr
+.endm
+
+.macro PRED16X16_PLANE_END_LASX // same inputs as PRED16X16_PLANE_END; 256-bit registers produce two output rows per iteration
+ ld.bu t7, t1, 0
+ ld.bu t8, t2, 16
+ add.d t5, t7, t8
+ addi.d t5, t5, 1
+ slli.d t5, t5, 4 // t5 = 16 * (t1[0] + t2[16] + 1)
+ add.d t7, t3, t4
+ slli.d t8, t7, 3
+ sub.d t7, t8, t7 // t7 = 7 * (t3 + t4)
+ sub.d t5, t5, t7 // t5 = base value of the first row
+
+ la.local t8, mulh
+ xvld xr3, t8, 0 // halfwords 0..15
+ xvreplgr2vr.h xr4, t3
+ xvmul.h xr5, xr3, xr4 // xr5 = t3 * {0..15}
+
+.rept 8
+ move t7, t5 // t7 = base value of the even row
+ add.d t5, t5, t4
+ xvreplgr2vr.h xr6, t7
+ xvreplgr2vr.h xr8, t5 // t5 = base value of the odd row
+ add.d t5, t5, t4 // t5 = base value for the next iteration
+ xvadd.h xr7, xr6, xr5 // even row, columns 0..15
+ xvadd.h xr9, xr8, xr5 // odd row, columns 0..15
+
+ xvssrani.bu.h xr9, xr7, 5 // per 128-bit lane: even-row bytes in the low 64 bits, odd-row bytes in the high 64 bits
+ vstelm.d vr9, a0, 0, 0 // even row, columns 0..7
+ xvstelm.d xr9, a0, 8, 2 // even row, columns 8..15
+ add.d a0, a0, a1
+ vstelm.d vr9, a0, 0, 1 // odd row, columns 0..7
+ xvstelm.d xr9, a0, 8, 3 // odd row, columns 8..15
+ add.d a0, a0, a1
+.endr
+.endm
+
+/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
+ * LSX store path; each sum in t3/t4 is rescaled to (5 * sum + 32) >> 6. */
+function ff_h264_pred16x16_plane_h264_8_lsx
+ PRED16X16_PLANE
+
+ slli.d t7, t3, 2
+ slli.d t8, t4, 2
+ add.d t3, t3, t7
+ add.d t4, t4, t8
+ addi.d t3, t3, 32
+ addi.d t4, t4, 32
+ srai.d t3, t3, 6
+ srai.d t4, t4, 6
+
+ PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
+ * LSX store path; each sum in t3/t4 is rescaled to (sum + (sum >> 2)) >> 4. */
+function ff_h264_pred16x16_plane_rv40_8_lsx
+ PRED16X16_PLANE
+
+ srai.d t7, t3, 2
+ srai.d t8, t4, 2
+ add.d t3, t3, t7
+ add.d t4, t4, t8
+ srai.d t3, t3, 4
+ srai.d t4, t4, 4
+
+ PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
+ * LSX store path; each sum becomes (5 * (sum / 4)) / 16 by signed division, then t3 and t4 are exchanged. */
+function ff_h264_pred16x16_plane_svq3_8_lsx
+ PRED16X16_PLANE
+
+ li.d t6, 4
+ li.d t7, 5
+ li.d t8, 16
+ div.d t3, t3, t6
+ div.d t4, t4, t6
+ mul.d t3, t3, t7
+ mul.d t4, t4, t7
+ div.d t3, t3, t8
+ div.d t4, t4, t8
+ move t6, t3
+ move t3, t4
+ move t4, t6
+
+ PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
+ * LASX store path; each sum in t3/t4 is rescaled to (5 * sum + 32) >> 6. */
+function ff_h264_pred16x16_plane_h264_8_lasx
+ PRED16X16_PLANE
+
+ slli.d t7, t3, 2
+ slli.d t8, t4, 2
+ add.d t3, t3, t7
+ add.d t4, t4, t8
+ addi.d t3, t3, 32
+ addi.d t4, t4, 32
+ srai.d t3, t3, 6
+ srai.d t4, t4, 6
+
+ PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
+ * LASX store path; each sum in t3/t4 is rescaled to (sum + (sum >> 2)) >> 4. */
+function ff_h264_pred16x16_plane_rv40_8_lasx
+ PRED16X16_PLANE
+
+ srai.d t7, t3, 2
+ srai.d t8, t4, 2
+ add.d t3, t3, t7
+ add.d t4, t4, t8
+ srai.d t3, t3, 4
+ srai.d t4, t4, 4
+
+ PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
+ * LASX store path; each sum becomes (5 * (sum / 4)) / 16 by signed division, then t3 and t4 are exchanged. */
+function ff_h264_pred16x16_plane_svq3_8_lasx
+ PRED16X16_PLANE
+
+ li.d t5, 4
+ li.d t7, 5
+ li.d t8, 16
+ div.d t3, t3, t5
+ div.d t4, t4, t5
+ mul.d t3, t3, t7
+ mul.d t4, t4, t7
+ div.d t3, t3, t8
+ div.d t4, t4, t8
+ move t5, t3
+ move t3, t4
+ move t4, t5
+
+ PRED16X16_PLANE_END_LASX
+endfunc
--
2.20.1
More information about the ffmpeg-devel
mailing list