[FFmpeg-cvslog] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
Lu Wang
git at videolan.org
Wed Mar 2 00:57:01 EET 2022
ffmpeg | branch: master | Lu Wang <wanglu at loongson.cn> | Thu Feb 17 19:11:49 2022 +0800| [b6ceeee16bebab698321cd03f7010701e92294b4] | committer: Michael Niedermayer
avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 110fps
after : 124fps
Signed-off-by: Hao Chen <chenhao at loongson.cn>
Reviewed-by: 殷时友 <yinshiyou-hf at loongson.cn>
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=b6ceeee16bebab698321cd03f7010701e92294b4
---
libavcodec/loongarch/Makefile | 2 +
libavcodec/loongarch/hevc_idct_lsx.c | 842 +++++++++
libavcodec/loongarch/hevc_lpf_sao_lsx.c | 2485 +++++++++++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 19 +
libavcodec/loongarch/hevcdsp_lsx.h | 26 +
5 files changed, 3374 insertions(+)
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 9f416d0c6e..cfc8e3aaff 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -26,3 +26,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_lpf_lsx.o \
loongarch/vp9_idct_lsx.o
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
+ loongarch/hevc_idct_lsx.o \
+ loongarch/hevc_lpf_sao_lsx.o
diff --git a/libavcodec/loongarch/hevc_idct_lsx.c b/libavcodec/loongarch/hevc_idct_lsx.c
new file mode 100644
index 0000000000..2193b27546
--- /dev/null
+++ b/libavcodec/loongarch/hevc_idct_lsx.c
@@ -0,0 +1,842 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf at loongson.cn>
+ * Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+/*
+ * HEVC inverse-transform coefficient tables.  The values are the HEVC
+ * transform-matrix basis constants, pre-shuffled so that tap pairs line
+ * up with the lane layout consumed by __lsx_vdp2_w_h dot products in the
+ * kernels below.  64-byte alignment permits aligned vector loads.
+ */
+static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+ 64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+/* Tap pairs for the 16x16 inverse transform (4 rows of 16 taps). */
+static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+ 64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+ 64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+ 64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+ 64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+/* Taps for the odd coefficients (1,3,...,31) of the 32x32 transform;
+ * consumed 16 taps per output row by hevc_idct_8x32_column_lsx(). */
+static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+ 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+ 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+ 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+ 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+ 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+ 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+ 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+ 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+ 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+ 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+ 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+ 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+ 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+ 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+ 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+/* Taps for coefficients 2,6,10,...,30 of the 32x32 transform. */
+static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+ 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+ 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+ 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+ 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+/* Taps for coefficients 4,12,20,28 of the 32x32 transform. */
+static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+ 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+/*
+ * One pass of the HEVC 4x4 inverse DCT over four 32-bit column vectors.
+ * The vldi immediates 0x0840/0x0853/0x0824 replicate the halfword
+ * constants 64/83/36 (HEVC 4x4 DCT basis) into every lane.  Results are
+ * rounded right-shifted by 'shift' and saturated to 16-bit range
+ * (vsat_w with 15 keeps sign bit + 15 magnitude bits).
+ */
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
+ sum0, sum1, sum2, sum3, shift) \
+{ \
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5; \
+ __m128i cnst64 = __lsx_vldi(0x0840); \
+ __m128i cnst83 = __lsx_vldi(0x0853); \
+ __m128i cnst36 = __lsx_vldi(0x0824); \
+ \
+ vec0 = __lsx_vdp2_w_h(in_r0, cnst64); \
+ vec1 = __lsx_vdp2_w_h(in_l0, cnst83); \
+ vec2 = __lsx_vdp2_w_h(in_r1, cnst64); \
+ vec3 = __lsx_vdp2_w_h(in_l1, cnst36); \
+ vec4 = __lsx_vdp2_w_h(in_l0, cnst36); \
+ vec5 = __lsx_vdp2_w_h(in_l1, cnst83); \
+ \
+ sum0 = __lsx_vadd_w(vec0, vec2); \
+ sum1 = __lsx_vsub_w(vec0, vec2); \
+ vec1 = __lsx_vadd_w(vec1, vec3); \
+ vec4 = __lsx_vsub_w(vec4, vec5); \
+ sum2 = __lsx_vsub_w(sum1, vec4); \
+ sum3 = __lsx_vsub_w(sum0, vec1); \
+ sum0 = __lsx_vadd_w(sum0, vec1); \
+ sum1 = __lsx_vadd_w(sum1, vec4); \
+ \
+ sum0 = __lsx_vsrari_w(sum0, shift); \
+ sum1 = __lsx_vsrari_w(sum1, shift); \
+ sum2 = __lsx_vsrari_w(sum2, shift); \
+ sum3 = __lsx_vsrari_w(sum3, shift); \
+ sum0 = __lsx_vsat_w(sum0, 15); \
+ sum1 = __lsx_vsat_w(sum1, 15); \
+ sum2 = __lsx_vsat_w(sum2, 15); \
+ sum3 = __lsx_vsat_w(sum3, 15); \
+}
+
+/*
+ * One pass of the HEVC 8x8 inverse DCT over eight 16-bit row vectors.
+ * NOTE: expects a variable named 'filter' (const int16_t *, pointing at
+ * gt8x8_cnst-style taps) to exist in the expanding scope; taps are
+ * loaded pairwise with __lsx_vldrepl_w.  Even/odd halves are combined
+ * with a butterfly, then narrowed with rounding/saturation by 'shift'
+ * (__lsx_vssrarni_h_w) back into in0..in7 in natural row order.
+ */
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
+{ \
+ __m128i src0_r, src1_r, src2_r, src3_r; \
+ __m128i src0_l, src1_l, src2_l, src3_l; \
+ __m128i filter0, filter1, filter2, filter3; \
+ __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
+ __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
+ __m128i sum0_r, sum1_r, sum2_r, sum3_r; \
+ __m128i sum0_l, sum1_l, sum2_l, sum3_l; \
+ \
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7, \
+ src0_r, src1_r, src2_r, src3_r); \
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7, \
+ src0_l, src1_l, src2_l, src3_l); \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8, \
+ filter, 12, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
+ temp1_r, temp1_l); \
+ \
+ LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+ sum1_l, sum1_r); \
+ sum2_r = sum1_r; \
+ sum2_l = sum1_l; \
+ sum3_r = sum0_r; \
+ sum3_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
+ src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
+ temp3_r, temp3_l); \
+ temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
+ temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
+ sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
+ sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
+ sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
+ \
+ in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
+ in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
+ src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
+ temp5_r, temp5_l); \
+ temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
+ temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
+ sum1_r = __lsx_vadd_w(sum1_r, temp4_r); \
+ sum1_l = __lsx_vadd_w(sum1_l, temp4_l); \
+ sum2_r = __lsx_vsub_w(sum2_r, temp4_r); \
+ sum2_l = __lsx_vsub_w(sum2_l, temp4_l); \
+ \
+ in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
+ in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24, \
+ filter, 28, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
+ temp1_r, temp1_l); \
+ \
+ LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+ sum1_l, sum1_r); \
+ sum2_r = sum1_r; \
+ sum2_l = sum1_l; \
+ sum3_r = sum0_r; \
+ sum3_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
+ src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
+ temp3_r, temp3_l); \
+ temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
+ temp2_l = __lsx_vadd_w(temp2_l, tem3_l); \
+ sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
+ sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
+ sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
+ \
+ in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
+ in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
+ src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
+ temp5_r, temp5_l); \
+ temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
+ temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp4_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp4_l); \
+ sum2_r = __lsx_vadd_w(sum2_r, temp4_r); \
+ sum2_l = __lsx_vadd_w(sum2_l, temp4_l); \
+ \
+ in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
+ in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
+}
+
+/*
+ * One pass of the HEVC 16x16 inverse DCT over 8 interleaved column
+ * vector pairs (_r = low halves, _l = high halves).
+ * NOTE: expects 'buf_ptr' (int16_t *), 'filter' (const int16_t *, taps
+ * from gt16x16_cnst, advanced by 16 per iteration), and loop variables
+ * 'j' and 'k' to exist in the expanding scope; the mirrored output rows
+ * (15-2j etc.) are stored via the ptr0/ptr1 offset arithmetic below.
+ */
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
+ src4_r, src5_r, src6_r, src7_r, \
+ src0_l, src1_l, src2_l, src3_l, \
+ src4_l, src5_l, src6_l, src7_l, shift) \
+{ \
+ int16_t *ptr0, *ptr1; \
+ __m128i dst0, dst1; \
+ __m128i filter0, filter1, filter2, filter3; \
+ __m128i temp0_r, temp1_r, temp0_l, temp1_l; \
+ __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
+ __m128i sum3_l, res0_r, res1_r, res0_l, res1_l; \
+ \
+ ptr0 = (buf_ptr + 112); \
+ ptr1 = (buf_ptr + 128); \
+ k = -1; \
+ \
+ for (j = 0; j < 4; j++) \
+ { \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16, \
+ filter, 20, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src4_r, filter2, src4_l, filter2, sum0_r, sum0_l, \
+ sum2_r, sum2_l); \
+ DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2, \
+ sum3_r, sum3_l); \
+ DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l, \
+ src1_l, filter1, sum2_r, src5_r, filter3, sum2_l, \
+ src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
+ DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l, \
+ src6_l, filter3, sum3_r, sum3_l); \
+ \
+ sum1_r = sum0_r; \
+ sum1_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24, \
+ filter, 28, filter0, filter1, filter2, filter3); \
+ filter += 16; \
+ DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, \
+ temp0_r, temp0_l); \
+ DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l, \
+ src6_l, filter2, sum2_r, sum2_l); \
+ DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2, \
+ temp1_r, temp1_l); \
+ \
+ sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
+ sum3_r = __lsx_vsub_w(temp1_r, sum3_r); \
+ sum3_l = __lsx_vsub_w(temp1_l, sum3_l); \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1, \
+ temp0_r, temp0_l); \
+ DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l, \
+ src7_l, filter3, sum3_r, src4_r, filter3, sum3_l, \
+ src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
+ \
+ sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
+ \
+ LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
+ res1_l, res1_r); \
+ dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
+ dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
+ __lsx_vst(dst0, buf_ptr, 0); \
+ __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0); \
+ \
+ LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
+ res1_l, res1_r); \
+ \
+ dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
+ dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
+ __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
+ __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
+ \
+ k *= -1; \
+ buf_ptr += 16; \
+ } \
+}
+
+/*
+ * Butterfly update of the even-stage accumulator buffer: loads the
+ * 8-lane int32 pair at row 'load_idx', stores sum0 + row back at
+ * load_idx and sum0 - row at 'store_idx' (the mirrored row).
+ * NOTE: uses 'tmp0_r/tmp0_l/tmp1_r/tmp1_l' from the expanding scope.
+ */
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
+{ \
+ tmp0_r = __lsx_vld(input + load_idx * 8, 0); \
+ tmp0_l = __lsx_vld(input + load_idx * 8, 16); \
+ tmp1_r = sum0_r; \
+ tmp1_l = sum0_l; \
+ sum0_r = __lsx_vadd_w(sum0_r, tmp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, tmp0_l); \
+ __lsx_vst(sum0_r, (input + load_idx * 8), 0); \
+ __lsx_vst(sum0_l, (input + load_idx * 8), 16); \
+ tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r); \
+ tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l); \
+ __lsx_vst(tmp1_r, (input + store_idx * 8), 0); \
+ __lsx_vst(tmp1_l, (input + store_idx * 8), 16); \
+}
+
+/*
+ * One pass of the HEVC 4x4 luma inverse transform (DST-style) over four
+ * 32-bit column vectors.  The vldi immediates 0x84a/0x837/0x81d
+ * replicate the constants 74/55/29 into every halfword lane.  Outputs
+ * are rounded by 'shift' and saturated to 16-bit range.
+ */
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
+ res0, res1, res2, res3, shift) \
+{ \
+ __m128i vec0, vec1, vec2, vec3; \
+ __m128i cnst74 = __lsx_vldi(0x84a); \
+ __m128i cnst55 = __lsx_vldi(0x837); \
+ __m128i cnst29 = __lsx_vldi(0x81d); \
+ \
+ vec0 = __lsx_vadd_w(in_r0, in_r1); \
+ vec2 = __lsx_vsub_w(in_r0, in_l1); \
+ res0 = __lsx_vmul_w(vec0, cnst29); \
+ res1 = __lsx_vmul_w(vec2, cnst55); \
+ res2 = __lsx_vsub_w(in_r0, in_r1); \
+ vec1 = __lsx_vadd_w(in_r1, in_l1); \
+ res2 = __lsx_vadd_w(res2, in_l1); \
+ vec3 = __lsx_vmul_w(in_l0, cnst74); \
+ res3 = __lsx_vmul_w(vec0, cnst55); \
+ \
+ res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55)); \
+ res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29)); \
+ res2 = __lsx_vmul_w(res2, cnst74); \
+ res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29)); \
+ \
+ res0 = __lsx_vadd_w(res0, vec3); \
+ res1 = __lsx_vadd_w(res1, vec3); \
+ res3 = __lsx_vsub_w(res3, vec3); \
+ \
+ res0 = __lsx_vsrari_w(res0, shift); \
+ res1 = __lsx_vsrari_w(res1, shift); \
+ res2 = __lsx_vsrari_w(res2, shift); \
+ res3 = __lsx_vsrari_w(res3, shift); \
+ res0 = __lsx_vsat_w(res0, 15); \
+ res1 = __lsx_vsat_w(res1, 15); \
+ res2 = __lsx_vsat_w(res2, 15); \
+ res3 = __lsx_vsat_w(res3, 15); \
+}
+
+/*
+ * 4x4 HEVC inverse DCT on a block of 16-bit coefficients, in place.
+ * Two passes: columns with shift 7, then rows with shift 12 after a
+ * 4x4 transpose (shifts per the HEVC inverse-transform stages for
+ * 8-bit output; col_limit is unused here).
+ */
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
+{
+ __m128i in0, in1;
+ __m128i in_r0, in_l0, in_r1, in_l1;
+ __m128i sum0, sum1, sum2, sum3;
+ __m128i zero = __lsx_vldi(0x00);
+
+ /* Widen the 16-bit rows to 32-bit lanes by interleaving with zero. */
+ in0 = __lsx_vld(coeffs, 0);
+ in1 = __lsx_vld(coeffs, 16);
+ in_r0 = __lsx_vilvl_h(zero, in0);
+ in_l0 = __lsx_vilvh_h(zero, in0);
+ in_r1 = __lsx_vilvl_h(zero, in1);
+ in_l1 = __lsx_vilvh_h(zero, in1);
+
+ HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+ LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+ HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+
+ /* Pack and transpose */
+ in0 = __lsx_vpickev_h(sum2, sum0);
+ in1 = __lsx_vpickev_h(sum3, sum1);
+ sum0 = __lsx_vilvl_h(in1, in0);
+ sum1 = __lsx_vilvh_h(in1, in0);
+ in0 = __lsx_vilvl_w(sum1, sum0);
+ in1 = __lsx_vilvh_w(sum1, sum0);
+
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 16);
+}
+
+/*
+ * 8x8 HEVC inverse DCT on a block of 16-bit coefficients, in place.
+ * Column pass (shift 7), transpose, row pass (shift 12), transpose
+ * back; col_limit is unused.
+ */
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
+{
+ /* Taps from the pre-interleaved 8x8 coefficient table; the
+ * HEVC_IDCT8x8_COL macro reads this 'filter' pointer by name. */
+ const int16_t *filter = &gt8x8_cnst[0];
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
+ coeffs, 48, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
+ coeffs, 112, in4, in5, in6, in7);
+ HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 16);
+ __lsx_vst(in2, coeffs, 32);
+ __lsx_vst(in3, coeffs, 48);
+ __lsx_vst(in4, coeffs, 64);
+ __lsx_vst(in5, coeffs, 80);
+ __lsx_vst(in6, coeffs, 96);
+ __lsx_vst(in7, coeffs, 112);
+}
+
+/*
+ * 16x16 HEVC inverse DCT on a block of 16-bit coefficients, in place.
+ * Column pass (shift 7) into a temporary buffer, row pass (shift 12)
+ * back into coeffs, then three 8x8 transposes restore the natural
+ * layout; col_limit is unused.
+ */
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
+{
+ int16_t i, j, k;
+ int16_t buf[256];
+ int16_t *buf_ptr = &buf[0];
+ int16_t *src = coeffs;
+ /* Taps read by name inside HEVC_IDCT16x16_COL (advances 'filter'). */
+ const int16_t *filter = &gt16x16_cnst[0];
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+ /* Column pass over the two 8-column halves. */
+ for (i = 2; i--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
+ in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
+ in12, in13, in14, in15);
+
+ /* Interleave even rows (and odd rows) pairwise to match the
+ * tap layout of gt16x16_cnst. */
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_l, src1_l, src2_l, src3_l);
+ DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_l, src5_l, src6_l, src7_l);
+
+ HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+ src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+ src4_l, src5_l, src6_l, src7_l, 7);
+
+ src += 8;
+ buf_ptr = (&buf[0] + 8);
+ filter = &gt16x16_cnst[0];
+ }
+
+ /* Row pass: read the column-pass result, transpose 8x8 tiles, run
+ * the same kernel with the final shift of 12. */
+ src = &buf[0];
+ buf_ptr = coeffs;
+ filter = &gt16x16_cnst[0];
+
+ for (i = 2; i--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ in0, in8, in1, in9);
+ DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
+ in2, in10, in3, in11);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
+ in4, in12, in5, in13);
+ DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
+ in6, in14, in7, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_l, src1_l, src2_l, src3_l);
+ DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_l, src5_l, src6_l, src7_l);
+ HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+ src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+ src4_l, src5_l, src6_l, src7_l, 12);
+
+ src += 128;
+ buf_ptr = coeffs + 8;
+ filter = &gt16x16_cnst[0];
+ }
+
+ /* Final transposes: top-left quadrant in place. */
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ __lsx_vst(vec0, coeffs, 0);
+ __lsx_vst(vec1, coeffs, 32);
+ __lsx_vst(vec2, coeffs, 64);
+ __lsx_vst(vec3, coeffs, 96);
+ __lsx_vst(vec4, coeffs, 128);
+ __lsx_vst(vec5, coeffs, 160);
+ __lsx_vst(vec6, coeffs, 192);
+ __lsx_vst(vec7, coeffs, 224);
+
+ /* Swap the off-diagonal quadrants while transposing each. */
+ src = coeffs + 8;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ src = coeffs + 128;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in12, in13, in14, in15);
+
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ src = coeffs + 8;
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+
+ /* Bottom-right quadrant in place. */
+ src = coeffs + 136;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+}
+
+/*
+ * One 8-column pass of the HEVC 32x32 inverse DCT.
+ * coeffs points at the first of 8 columns; rows are 'buf_pitch'
+ * coefficients apart.  The even part (coeffs 0,8,16,24 then 4,12,20,28
+ * then 2,6,...,30) is accumulated in a 64-byte-aligned int32 scratch
+ * buffer; the odd part (coeffs 1,3,...,31) is then added/subtracted to
+ * produce the mirrored output rows, rounded by 'round' and saturated
+ * to 16 bits on store.
+ */
+static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
+ uint8_t round)
+{
+ uint8_t i;
+ int32_t buf_pitch_2 = buf_pitch << 1;
+ int32_t buf_pitch_4 = buf_pitch << 2;
+ int32_t buf_pitch_8 = buf_pitch << 3;
+ int32_t buf_pitch_16 = buf_pitch << 4;
+
+ /* Tap tables for the odd, 2-mod-4, 4-mod-8 and 0-mod-8 coefficient
+ * groups respectively. */
+ const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+ const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+ const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+ const int16_t *filter_ptr3 = &gt8x8_cnst[0];
+ int16_t *src0 = (coeffs + buf_pitch);
+ int16_t *src1 = (coeffs + buf_pitch_2);
+ int16_t *src2 = (coeffs + buf_pitch_4);
+ int16_t *src3 = (coeffs);
+ int32_t tmp_buf[8 * 32 + 15];
+ int32_t *tmp_buf_ptr = tmp_buf + 15;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+ /* Align pointer to 64 byte boundary */
+ tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+ /* process coeff 4, 12, 20, 28 */
+ in0 = __lsx_vld(src2, 0);
+ in1 = __lsx_vld(src2 + buf_pitch_8, 0);
+ in2 = __lsx_vld(src2 + buf_pitch_16, 0);
+ in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
+ in4 = __lsx_vld(src3, 0);
+ in5 = __lsx_vld(src3 + buf_pitch_8, 0);
+ in6 = __lsx_vld(src3 + buf_pitch_16, 0);
+ in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
+ src0_l, src1_l, src2_l, src3_l);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 0);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 16);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 32);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 48);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 64);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 80);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 96);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 112);
+
+ /* process coeff 0, 8, 16, 24 */
+ filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+ src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+ sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+ sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+ src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+ sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+ sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
+
+ /* process coeff 2 6 10 14 18 22 26 30 */
+ in0 = __lsx_vld(src1, 0);
+ in1 = __lsx_vld(src1 + buf_pitch_4, 0);
+ in2 = __lsx_vld(src1 + buf_pitch_8, 0);
+ in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
+ in4 = __lsx_vld(src1 + buf_pitch_16, 0);
+ in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
+ in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
+ in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);
+
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_l, src1_l, src2_l, src3_l);
+
+ /* loop for all columns of constants */
+ for (i = 0; i < 8; i++) {
+ /* processing single column of constants */
+ filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
+ filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
+ filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+
+ /* Butterfly with the even-part accumulator: row i and its
+ * mirror row 15-i. */
+ tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
+ tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
+ tmp1_r = tmp0_r;
+ tmp1_l = tmp0_l;
+ tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+ tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+ tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+ tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+ __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
+ __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
+ __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
+ __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
+
+ filter_ptr1 += 8;
+ }
+
+ /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+ in0 = __lsx_vld(src0, 0);
+ in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+ in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+ in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+ in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+ in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+ in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+ in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+ src0 += 16 * buf_pitch;
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_l, src1_l, src2_l, src3_l);
+ in0 = __lsx_vld(src0, 0);
+ in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+ in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+ in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+ in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+ in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+ in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+ in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src4_l, src5_l, src6_l, src7_l);
+
+ /* loop for all columns of filter constants */
+ for (i = 0; i < 16; i++) {
+ /* processing single column of constants */
+ filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
+ filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
+ filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+ tmp1_r = sum0_r;
+ tmp1_l = sum0_l;
+
+ filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
+ filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
+ filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
+ sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ /* even +/- odd gives output rows i and 31-i; round, saturate
+ * and narrow to 16 bits on the way out. */
+ tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
+ tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
+ tmp1_r = tmp0_r;
+ tmp1_l = tmp0_l;
+ tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+ tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+ sum1_r = __lsx_vreplgr2vr_w(round);
+ tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
+ tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
+ in0 = __lsx_vpackev_d(tmp0_l, tmp0_r);
+ __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
+ tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+ tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+ tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
+ tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
+ in0 = __lsx_vpackev_d(tmp1_l, tmp1_r);
+ __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
+
+ filter_ptr0 += 16;
+ }
+}
+
+/*
+ * Transpose a 32x8 strip of 16-bit coefficients (row stride 32 in
+ * 'coeffs') into a contiguous 8x32 layout in 'tmp_buf', one 8x8 tile
+ * at a time.
+ */
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+ uint8_t i;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (i = 0; i < 4; i++) {
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
+ coeffs, 192, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
+ coeffs, 448, in4, in5, in6, in7);
+ coeffs += 8;
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ __lsx_vst(in0, tmp_buf, 0);
+ __lsx_vst(in1, tmp_buf, 16);
+ __lsx_vst(in2, tmp_buf, 32);
+ __lsx_vst(in3, tmp_buf, 48);
+ __lsx_vst(in4, tmp_buf, 64);
+ __lsx_vst(in5, tmp_buf, 80);
+ __lsx_vst(in6, tmp_buf, 96);
+ __lsx_vst(in7, tmp_buf, 112);
+ tmp_buf += 64;
+ }
+}
+
+/*
+ * Inverse of hevc_idct_transpose_32x8_to_8x32(): transpose a contiguous
+ * 8x32 strip in 'tmp_buf' back into the 32x8 layout (row stride 32) in
+ * 'coeffs', one 8x8 tile at a time.
+ */
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+ uint8_t i;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (i = 0; i < 4; i++) {
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
+ tmp_buf, 48, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
+ tmp_buf, 112, in4, in5, in6, in7);
+ tmp_buf += 64;
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 64);
+ __lsx_vst(in2, coeffs, 128);
+ __lsx_vst(in3, coeffs, 192);
+ __lsx_vst(in4, coeffs, 256);
+ __lsx_vst(in5, coeffs, 320);
+ __lsx_vst(in6, coeffs, 384);
+ __lsx_vst(in7, coeffs, 448);
+ coeffs += 8;
+ }
+}
+
+/*
+ * 32x32 HEVC inverse DCT on a block of 16-bit coefficients, in place.
+ * Column pass: four 8x32 column strips at pitch 32, shift 7.  Row pass:
+ * each 32x8 row strip is transposed to 8x32, transformed at pitch 8
+ * with shift 12, and transposed back.  col_limit is unused.
+ */
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
+{
+ uint8_t row_cnt, col_cnt;
+ int16_t *src = coeffs;
+ /* 31 spare int16 elements (62 bytes) guarantee a 64-byte-aligned
+ * pointer can be carved out below without leaving the array. */
+ int16_t tmp_buf[8 * 32 + 31];
+ int16_t *tmp_buf_ptr = tmp_buf + 31;
+ uint8_t round;
+ int32_t buf_pitch;
+
+ /* Align pointer to 64 byte boundary */
+ tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+ /* column transform */
+ round = 7;
+ buf_pitch = 32;
+ for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+ /* process 8x32 blocks */
+ hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
+ }
+
+ /* row transform */
+ round = 12;
+ buf_pitch = 8;
+ for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+ /* process 32x8 blocks */
+ src = (coeffs + 32 * 8 * row_cnt);
+
+ hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+ hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
+ hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+ }
+}
diff --git a/libavcodec/loongarch/hevc_lpf_sao_lsx.c b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
new file mode 100644
index 0000000000..fc10e8eda8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
@@ -0,0 +1,2485 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ * Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+/* HEVC deblocking filter, luma, horizontal edge, 8-bit samples (LSX).
+ * Filters one 8-pixel-wide horizontal edge at 'src'. The two 4-pixel
+ * halves of the edge carry independent tc / pcm flags (tc[0]/tc[1],
+ * p_is_pcm[0..1], q_is_pcm[0..1]); 'beta' is the shared threshold.
+ * Per half-edge the filter is either strong, weak, or off, per the
+ * H.265 8.7.2 decision process. */
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ /* rows p3..p0 lie above the edge, q0..q3 below it */
+ uint8_t *p3 = src - stride_4x;
+ uint8_t *p2 = src - stride_3x;
+ uint8_t *p1 = src - stride_2x;
+ uint8_t *p0 = src - stride;
+ uint8_t *q0 = src;
+ uint8_t *q1 = src + stride;
+ uint8_t *q2 = src + stride_2x;
+ uint8_t *q3 = src + stride_3x;
+ uint8_t flag0, flag1;
+ int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
+ int32_t dp04, dq04, dp34, dq34, d04, d34;
+ int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+ int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i temp0, temp1;
+ __m128i temp2, tc_pos, tc_neg;
+ __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+ __m128i zero = {0};
+ __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+ /* second-derivative "activity" at columns 0 and 3 (first half)
+  * and 4 and 7 (second half) of the edge */
+ dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+ dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+ dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+ dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+ d00 = dp00 + dq00;
+ d30 = dp30 + dq30;
+ dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+ dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+ dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+ dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+ d04 = dp04 + dq04;
+ d34 = dp34 + dq34;
+
+ p_is_pcm0 = p_is_pcm[0];
+ p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm0 = q_is_pcm[0];
+ q_is_pcm4 = q_is_pcm[1];
+
+ /* build per-half-edge lane masks: vseqi_d(x, 0) yields all-ones in
+  * the 64-bit lane whose flag is zero ("may filter") */
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+ DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
+ cmp3 = __lsx_vpackev_w(cmp1, cmp0);
+ cmp3 = __lsx_vseqi_w(cmp3, 0);
+
+ /* skip entirely when both halves are PCM or both exceed beta */
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
+ p3_src, p2_src, p1_src, p0_src);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ /* tc25 = (tc * 5 + 1) >> 1, the strong-filter gradient bound */
+ tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
+ /* widen pixels to 16 bits for the arithmetic below */
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+ p0_src, p3_src, p2_src, p1_src, p0_src);
+ DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
+ q0_src, q1_src, q2_src, q3_src);
+ /* strong/weak decision per half-edge (columns 0&3, then 4&7) */
+ flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+ abs(p0[0] - q0[0]) < tc250;
+ flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+ abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
+ zero, q3_src, q0_src, q1_src, q2_src, q3_src);
+
+ flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+ abs(p0[4] - q0[4]) < tc254;
+ flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+ abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
+ cmp2 = __lsx_vpackev_w(cmp1, cmp0);
+ cmp2 = __lsx_vseqi_w(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter: tc doubled, filtered taps clipped to +-2*tc */
+ tc_pos = __lsx_vslli_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part: temp0 = p1 + p0 + q0, reused for all three p outputs */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
+ temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ /* invert mask: all-ones lanes now mean "p side is PCM, keep src" */
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+ p1_src, p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part: mirror of the p part with temp0 = q1 + q0 + p0 */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp0 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
+ temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+ q1_src, q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+ /* pack results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = __lsx_vpickev_b(dst5, dst4);
+
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+ /* half-edges whose d-sum exceeded beta keep the source pixels */
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+ /* write back rows p2..q2 (6 rows, two per vector) */
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+ __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter: delta0 = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4,
+  * applied only where |delta0| < 10*tc */
+ tc_neg = __lsx_vneg_h(tc_pos);
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src,
+ __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
+ q_is_pcm_vec));
+ DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+ q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+
+ /* side-threshold for filtering p1/q1: (beta + beta/2) >> 3 */
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp0 = __lsx_vseqi_d(cmp0, 0);
+ p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp0 = __lsx_vseqi_d(cmp0, 0);
+ q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
+ /* p1/q1 corrections are clipped to +-tc/2 */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
+ tc_neg, tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+ p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+ q1_src, abs_delta0, dst1, dst2, dst3, dst4);
+ /* pack results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
+ dst2, dst3);
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
+ dst0, dst1);
+
+ /* weak filter only touches rows p1..q1 (start one row below p2) */
+ p2 += stride;
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ /* weak filter ends */
+ } else { /* strong + weak */
+ /* one half-edge is strong, the other weak: compute both and
+  * blend per 64-bit lane with cmp2 */
+ /* strong filter */
+ tc_pos = __lsx_vslli_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+ p1_src, p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+ q1_src, q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+ /* pack strong results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = __lsx_vpickev_b(dst5, dst4);
+ /* strong filter ends */
+
+ /* weak filter (tc back to its original value: 2*tc >> 1) */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ /* pcm masks were already inverted by the strong path above */
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+ q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+ q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+ /* weak filter ends */
+
+ /* pack weak results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, delta2);
+
+ /* select between weak or strong */
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
+
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+ /* restore source pixels where the edge is not filtered at all */
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+ __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+ }
+ }
+}
+
+/* HEVC deblocking filter, luma, vertical edge, 8-bit samples (LSX).
+ * Filters one 8-pixel-tall vertical edge at 'src'. The pixel rows are
+ * loaded, transposed so the edge becomes horizontal, filtered with the
+ * same arithmetic as the horizontal variant, then transposed back on
+ * store. NOTE: here p3/p2/p1/p0 are ROW pointers (rows 0, 3, 4, 7)
+ * used for the scalar decision reads, not the p-side columns. */
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ uint8_t *p3 = src;
+ uint8_t *p2 = src + stride_3x;
+ uint8_t *p1 = src + stride_4x;
+ uint8_t *p0 = src + stride_4x + stride_3x;
+ uint8_t flag0, flag1;
+ int32_t dp00, dq00, dp30, dq30, d00, d30;
+ int32_t d0030, d0434;
+ int32_t dp04, dq04, dp34, dq34, d04, d34;
+ int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+ int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i cmp3;
+ __m128i temp0, temp1;
+ __m128i temp2;
+ __m128i tc_pos, tc_neg;
+ __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+ __m128i zero = {0};
+ __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+ /* second-derivative activity across the vertical edge, rows 0 & 3 */
+ dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+ dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+ dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+ dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+ d00 = dp00 + dq00;
+ d30 = dp30 + dq30;
+ p_is_pcm0 = p_is_pcm[0];
+ q_is_pcm0 = q_is_pcm[0];
+
+ /* rows 4 & 7 for the second half-edge */
+ dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+ dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+ dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+ dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+ d04 = dp04 + dq04;
+ d34 = dp34 + dq34;
+ p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm4 = q_is_pcm[1];
+
+ /* per-half-edge 64-bit lane masks (all-ones = "may filter") */
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
+ cmp3 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp3 = __lsx_vseqi_d(cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ /* load the 8x8 block straddling the edge (4 columns each side) */
+ src -= 4;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
+ src += stride_4x;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
+ src -= stride_4x;
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ /* tc25 = (tc * 5 + 1) >> 1; tc is pre-doubled into tc_pos below */
+ tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+ DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ /* rotate the block so the vertical edge becomes horizontal */
+ LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+ q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+ q0_src, q1_src, q2_src, q3_src);
+
+ /* strong/weak decision per half-edge (rows 0&3, then 4&7) */
+ flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+ abs(p3[-1] - p3[0]) < tc250;
+ flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+ abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ cmp0 = __lsx_vreplgr2vr_d(flag0);
+ /* widen to 16 bits for the filter arithmetic */
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+ p0_src, p3_src, p2_src, p1_src, p0_src);
+
+ flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+ abs(p1[-1] - p1[0]) < tc254;
+ flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+ abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
+ q3_src, q0_src, q1_src, q2_src, q3_src);
+
+ cmp1 = __lsx_vreplgr2vr_d(flag1);
+ cmp2 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp2 = __lsx_vseqi_d(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter (tc_pos already holds 2*tc) */
+ tc_neg = __lsx_vneg_h(tc_pos);
+ /* p part: temp0 = p1 + p0 + q0 */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ /* invert: all-ones lanes now mean "PCM, keep source" */
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+ p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part: mirror with temp0 = q1 + q0 + p0 */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+ q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+ /* strong filter ends; fall through to the common store path */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter: tc_pos is 2*tc on entry, halve back to tc */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* delta0 = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4 */
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ /* threshold 10*tc: filter only where |delta0| < 10*tc */
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ /* side-threshold for the p1/q1 taps */
+ tmp = ((beta + (beta >> 1)) >> 3);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+ !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ /* NOTE(review): replicates per-halfword (_h) where the p side
+  * uses _d; harmless since the 0/1 flag is compared per 64-bit
+  * lane against zero below, but inconsistent — confirm intent */
+ DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+ (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+ /* p1/q1 corrections are clipped to +-tc/2 */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+ p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+ q1_src, abs_delta0, dst0, dst1, dst2, dst3);
+ /* weak filter ends */
+
+ /* restore source where the half-edge is not filtered */
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
+ cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
+
+ /* transpose the 4 modified columns back and store 4 bytes/row */
+ dst4 = __lsx_vilvl_b(dst1, dst0);
+ dst5 = __lsx_vilvh_b(dst1, dst0);
+ dst0 = __lsx_vilvl_h(dst5, dst4);
+ dst1 = __lsx_vilvh_h(dst5, dst4);
+
+ /* weak filter only touches p1..q1: start 2 columns right of src-4 */
+ src += 2;
+ __lsx_vstelm_w(dst0, src, 0, 0);
+ __lsx_vstelm_w(dst0, src + stride, 0, 1);
+ __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
+ __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
+ src += stride_4x;
+ __lsx_vstelm_w(dst1, src, 0, 0);
+ __lsx_vstelm_w(dst1, src + stride, 0, 1);
+ __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
+ __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
+ /* early return: weak path has its own narrower store */
+ return;
+ } else { /* strong + weak */
+ /* one half-edge strong, the other weak: compute both, blend
+  * per 64-bit lane with cmp2 */
+ /* strong filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+ p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+ q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+ /* strong filter ends */
+
+ /* weak filter (2*tc halved back to tc) */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ /* pcm masks were already inverted by the strong path above */
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+ !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ /* NOTE(review): _h vs _d replicate — see weak-only branch */
+ DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+ (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+ q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+ q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+ /* weak filter ends*/
+
+ /* select between weak or strong */
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
+ cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
+ dst4, dst5);
+ }
+
+ /* common store path (strong-only and strong+weak cases):
+  * restore source where the half-edge is not filtered at all */
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
+ p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
+ dst4, dst5);
+
+ /* pack results to 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
+ dst5, dst0, dst1, dst2, dst3);
+
+ /* transpose back: 6 modified columns (p2..q2) per row */
+ DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
+ DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
+
+ /* store 4+2 bytes per row, starting at column -3 (src was src-4) */
+ src += 1;
+ __lsx_vstelm_w(dst0, src, 0, 0);
+ __lsx_vstelm_h(dst2, src, 4, 0);
+ src += stride;
+ __lsx_vstelm_w(dst0, src, 0, 1);
+ __lsx_vstelm_h(dst2, src, 4, 2);
+ src += stride;
+
+ __lsx_vstelm_w(dst0, src, 0, 2);
+ __lsx_vstelm_h(dst2, src, 4, 4);
+ src += stride;
+ __lsx_vstelm_w(dst0, src, 0, 3);
+ __lsx_vstelm_h(dst2, src, 4, 6);
+ src += stride;
+
+ __lsx_vstelm_w(dst1, src, 0, 0);
+ __lsx_vstelm_h(dst3, src, 4, 0);
+ src += stride;
+ __lsx_vstelm_w(dst1, src, 0, 1);
+ __lsx_vstelm_h(dst3, src, 4, 2);
+ src += stride;
+
+ __lsx_vstelm_w(dst1, src, 0, 2);
+ __lsx_vstelm_h(dst3, src, 4, 4);
+ src += stride;
+ __lsx_vstelm_w(dst1, src, 0, 3);
+ __lsx_vstelm_h(dst3, src, 4, 6);
+ }
+}
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm)
+{
+    /* HEVC chroma deblocking, horizontal edge, 8-bit, LSX.
+     * Only p0/q0 are modified; the two 4-pixel halves of the edge use
+     * tc[0]/tc[1] and the matching pcm flags independently. */
+    uint8_t *row_p1 = src - (stride << 1);
+    uint8_t *row_p0 = src - stride;
+    uint8_t *row_q0 = src;
+    uint8_t *row_q1 = src + stride;
+    __m128i rep0, rep1, no_p_mask, no_q_mask;
+    __m128i p1, p0, q0, q1;
+    __m128i tc_pos, tc_neg;
+    __m128i zero = {0};
+    __m128i out_p, out_q, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+        /* tc per halfword, one half-edge per 64-bit lane */
+        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], rep0, rep1);
+        tc_pos = __lsx_vpackev_d(rep1, rep0);
+        tc_neg = __lsx_vneg_h(tc_pos);
+        /* all-ones lanes mark half-edges that may be filtered */
+        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], rep0, rep1);
+        no_p_mask = __lsx_vpackev_d(rep1, rep0);
+        no_p_mask = __lsx_vseqi_d(no_p_mask, 0);
+
+        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], rep0, rep1);
+        no_q_mask = __lsx_vpackev_d(rep1, rep0);
+        no_q_mask = __lsx_vseqi_d(no_q_mask, 0);
+
+        DUP4_ARG2(__lsx_vld, row_p1, 0, row_p0, 0, row_q0, 0, row_q1, 0,
+                  p1, p0, q0, q1);
+        /* widen to 16 bits */
+        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+                  p1, p0, q0, q1);
+        /* delta = clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc) */
+        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, out_p, out_q);
+        out_p = __lsx_vslli_h(out_p, 2);
+        out_p = __lsx_vadd_h(out_p, out_q);
+        delta = __lsx_vsrari_h(out_p, 3);
+        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+
+        out_p = __lsx_vadd_h(p0, delta);
+        out_p = __lsx_vclip255_h(out_p);
+        no_p_mask = __lsx_vnor_v(no_p_mask, no_p_mask);
+        out_p = __lsx_vbitsel_v(out_p, p0, no_p_mask);
+
+        out_q = __lsx_vsub_h(q0, delta);
+        out_q = __lsx_vclip255_h(out_q);
+        no_q_mask = __lsx_vnor_v(no_q_mask, no_q_mask);
+        out_q = __lsx_vbitsel_v(out_q, q0, no_q_mask);
+
+        /* half-edges with tc <= 0 keep the original pixels */
+        tc_pos = __lsx_vslei_d(tc_pos, 0);
+        DUP2_ARG3(__lsx_vbitsel_v, out_p, p0, tc_pos, out_q, q0, tc_pos,
+                  out_p, out_q);
+        /* narrow to bytes and store rows p0 and q0 */
+        out_p = __lsx_vpickev_b(out_q, out_p);
+        __lsx_vstelm_d(out_p, row_p0, 0, 0);
+        __lsx_vstelm_d(out_p, row_p0 + stride, 0, 1);
+    }
+}
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i p1, p0, q0, q1;
+ __m128i tc_pos, tc_neg;
+ __m128i zero = {0};
+ __m128i temp0, temp1, delta;
+
+ if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ src -= 2;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, src0, src1, src2, src3);
+ src += stride_4x;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, src4, src5, src6, src7);
+ src -= stride_4x;
+ LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
+ p1, p0, q0, q1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+ p1, p0, q0, q1);
+
+ DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+ temp0 = __lsx_vslli_h(temp0, 2);
+ temp0 = __lsx_vadd_h(temp0, temp1);
+ delta = __lsx_vsrari_h(temp0, 3);
+ delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+
+ temp0 = __lsx_vadd_h(p0, delta);
+ temp1 = __lsx_vsub_h(q0, delta);
+ DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
+ DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+ q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
+ q_is_pcm_vec, temp0, temp1);
+
+ tc_pos = __lsx_vslei_d(tc_pos, 0);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+ temp0, temp1);
+ temp0 = __lsx_vpackev_b(temp1, temp0);
+
+ src += 1;
+ __lsx_vstelm_h(temp0, src, 0, 0);
+ __lsx_vstelm_h(temp0, src + stride, 0, 1);
+ __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
+ __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
+ src += stride_4x;
+ __lsx_vstelm_h(temp0, src, 0, 4);
+ __lsx_vstelm_h(temp0, src + stride, 0, 5);
+ __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
+ __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
+ src -= stride_4x;
+ }
+}
+
/* HEVC SAO edge-offset filter, 0-degree (horizontal) class, 4-pixel width.
 *
 * Two rows are processed per iteration.  For every pixel the left and
 * right horizontal neighbours are compared against the centre: each
 * comparison yields -1/0/+1, their sum plus 2 gives an edge class in
 * [0, 4] which is looked up in edge_idx and then in the sao_offset
 * table; the chosen offset is added with signed saturation.
 * height is assumed even (the loop decrements by 2 until it hits 0 and
 * the final row pair is handled after the loop).
 */
static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    /* shuf1/shuf2 pick bytes 1..8 (centre) and 2..9 (right neighbour)
     * from rows loaded at src - 1; the unshifted load itself supplies
     * the left neighbour */
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    /* edge class -> offset index lookup table {1, 2, 0, 3, 4} */
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
    __m128i const1 = __lsx_vldi(1);
    __m128i zero = {0};

    /* narrow the 16-bit offset table to bytes */
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        /* pack the two rows into one vector: left neighbours, centres,
         * right neighbours */
        src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
        src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
        src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

        /* diff = 0 where centre == neighbour, -1 where centre < neighbour,
         * +1 where centre > neighbour */
        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        /* edge class = diff_left + diff_right + 2, in [0, 4] */
        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);

        /* load in advance */
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        /* class -> edge_idx -> sao offset, then add with the xor-128
         * bias trick so vsadd_b saturates unsigned pixels correctly */
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
                  sao_offset, sao_offset, offset, offset, offset);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    /* last row pair, same computation as the loop body */
    src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
    src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
    src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);

    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
              offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
+
/* HEVC SAO edge-offset filter, 0-degree (horizontal) class, 8-pixel width.
 *
 * Same algorithm as the 4-width variant: per pixel, compare left and
 * right neighbours against the centre, form an edge class in [0, 4],
 * map it through edge_idx and the sao_offset table and add the offset
 * with signed saturation.  Two rows per iteration; height is assumed
 * even (the final row pair is handled after the loop).
 */
static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    /* shuf1/shuf2 pick bytes 1..8 (centre) and 2..9 (right neighbour)
     * from rows loaded at src - 1 */
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    /* edge class -> offset index lookup table {1, 2, 0, 3, 4} */
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i zeros = {0};

    /* narrow the 16-bit offset table to bytes */
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        /* build centre and right-neighbour vectors for both rows, then
         * pack each pair of rows into a single 16-byte vector */
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
                  src_minus11, shuf1, src0, src1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
                  src_minus11, shuf2, src_plus10, src_plus11);
        DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
                  src_plus10, src_minus10, src_plus10);
        src0 = __lsx_vpickev_d(src1, src0);

        /* diff = 0 where centre == neighbour, -1 where centre < neighbour,
         * +1 where centre > neighbour */
        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        /* edge class = diff_left + diff_right + 2, in [0, 4] */
        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);

        /* load in advance */
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        /* class -> edge_idx -> sao offset, then biased saturating add */
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    /* last row pair, same computation as the loop body */
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
              shuf1, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
              src_plus10, src_minus10, src_plus10);
    src0 = __lsx_vpickev_d(src1, src0);

    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
              cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
+
/* HEVC SAO edge-offset filter, 0-degree (horizontal) class, width a
 * multiple of 16.
 *
 * Processes a 16-column stripe of 4 rows per inner iteration.  For each
 * pixel the left/right neighbours are compared with the centre to form
 * an edge class in [0, 4], which is mapped through edge_idx and the
 * sao_offset table; the offset is added with signed saturation.
 * The loop structure assumes height is a multiple of 4 and width a
 * multiple of 16.
 */
static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    /* shuf1/shuf2 combine the previous 16-byte chunk with the next one
     * to produce the centre (bytes 1..16) and right-neighbour
     * (bytes 2..17) vectors */
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    /* edge class -> offset index lookup table {1, 2, 0, 3, 4} */
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i sao_offset;
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    __m128i src_minus10, src_minus11, src_minus12, src_minus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;

    /* narrow the 16-bit offset table to bytes */
    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        /* prime the pipeline with the chunk containing the left
         * neighbour of column 0 (loaded at src - 1) */
        src_minus1 = src - 1;
        src_minus10 = __lsx_vld(src_minus1, 0);
        DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                  src_stride_2x, src_minus11, src_minus12);
        src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus1 += 16;
            dst_ptr = dst + v_cnt;
            /* next 16-byte chunk of each of the 4 rows */
            src10 = __lsx_vld(src_minus1, 0);
            DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_minus1, src_stride_3x);
            /* centre and right-neighbour vectors spanning the chunk
             * boundary */
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
                      src_minus11, shuf1, src12, src_minus12, shuf1, src13,
                      src_minus13, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
                      src_minus11, shuf2, src12, src_minus12, shuf2, src13,
                      src_minus13, shuf2, src_plus10, src_plus11,
                      src_plus12, src_plus13);
            /* per-pixel sign of (centre - neighbour): 0 / -1 / +1 */
            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            /* edge class = diff_left + diff_right + 2, in [0, 4] */
            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            /* class -> edge_idx -> sao offset, per row */
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            /* xor-128 bias so vsadd_b saturates unsigned pixels */
            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            /* current chunks become the "previous" chunks of the next
             * 16 columns */
            src_minus10 = src10;
            src_minus11 = src11;
            src_minus12 = src12;
            src_minus13 = src13;

            __lsx_vst(dst0, dst_ptr, 0);
            __lsx_vst(dst1, dst_ptr + dst_stride, 0);
            __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
            __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}
+
+static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i dst0;
+ __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src_minus11, src10, src11;
+ __m128i src_zero0, src_zero1;
+ __m128i offset;
+ __m128i offset_mask0, offset_mask1;
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ /* load in advance */
+ DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
+ src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
/* HEVC SAO edge-offset filter, 90-degree (vertical) class, 8-pixel width.
 *
 * Two rows are processed per iteration.  For each pixel the rows above
 * and below are compared against the centre; the interleaved layout
 * lets a single widening horizontal add (vhaddw_hu_bu) sum the two
 * signs per pixel.  The sum plus 2 gives an edge class in [0, 4],
 * mapped through edge_idx and the sao_offset table; the offset is added
 * with signed saturation.  height is assumed even (the final row pair
 * is handled after the loop).
 */
static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    /* edge class -> offset index lookup table {1, 2, 0, 3, 4} */
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i offset_mask0, offset_mask1;

    /* narrow the 16-bit offset table to bytes */
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    /* load in advance: row -1, row 0, row 1, row 2 */
    DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        /* interleave (above, below) neighbour pairs and duplicate the
         * centre pixels to match that layout */
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        /* per-pair sign of (centre - neighbour): 0 / -1 / +1 */
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        /* edge class = sign_above + sign_below + 2, in [0, 4] */
        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        /* class -> edge_idx -> sao offset, then biased saturating add */
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    /* last row pair, same computation as the loop body */
    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
+
+static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *
+ sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig = src;
+ uint8_t *dst_orig = dst;
+ int32_t h_cnt, v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13;
+ __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+ __m128i src12, dst2, src13, dst3;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src = src_orig + v_cnt;
+ dst = dst_orig + v_cnt;
+
+ DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
+ src_minus10, src_minus11);
+
+ for (h_cnt = (height >> 2); h_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src, src_stride_3x, src, src_stride_4x,
+ src10, src11, src12, src13);
+ DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
+ src10, src10, src_minus11, src10, src11, cmp_minus10,
+ cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
+ src12, src13, cmp_minus12, cmp_plus12,
+ cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
+ src10, src10, src_minus11, src10, src11, cmp_minus10,
+ cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
+ src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0,\
+ offset_mask0, offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ src_minus10 = src12;
+ DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
+ src12, 128, src_minus11, src10, src11, src12);
+ DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
+ offset_mask1, src11, offset_mask2, src12,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+ src_minus11 = src13;
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+ }
+}
+
/* HEVC SAO edge-offset filter, 45-degree (diagonal) class, 4-pixel width.
 *
 * Two rows are processed per iteration.  For each pixel the upper-right
 * and lower-left diagonal neighbours are compared with the centre; the
 * two neighbours are interleaved so one widening horizontal add
 * (vhaddw_hu_bu) sums the two signs per pixel.  The sum plus 2 gives an
 * edge class in [0, 4], mapped through edge_idx and the sao_offset
 * table; the offset is added with signed saturation.  height is assumed
 * even (the final row pair is handled after the loop).
 */
static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    /* shuf1/shuf2 pick bytes 1..8 (centre column) and 2..9 (one column
     * right) from rows loaded at src - 1 */
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    /* edge class -> offset index lookup table {1, 2, 0, 3, 4} */
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus11, src10, src11;
    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    /* narrow the 16-bit offset table to bytes */
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance: rows -1, 0, 1 and 2 at column -1 */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        /* centre pixels of rows 0/1 and their lower-right diagonal
         * neighbours (next row, shifted one column right) */
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus0, src_plus1);

        /* interleave the two diagonal neighbours per pixel and duplicate
         * the centre pixels to match that layout */
        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
                  src_zero1, src_zero0, src_zero1);
        /* per-pair sign of (centre - neighbour): 0 / -1 / +1 */
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        /* edge class = sign_minus + sign_plus + 2, in [0, 4] */
        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        /* class -> edge_idx -> sao offset, then biased saturating add */
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    /* last row pair, same computation as the loop body */
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus0, src_plus1);

    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
+
+/*
+ * HEVC SAO edge filter, 45-degree diagonal direction (EO class 2),
+ * 8-pixel-wide column, 8-bit samples.
+ *
+ * Each sample is compared with its two 45-degree diagonal neighbours;
+ * the resulting sign pattern (mapped through edge_idx) selects the SAO
+ * offset that is saturating-added to the sample.  Two rows are produced
+ * per iteration; height is expected to be even and >= 2 (the last row
+ * pair is handled after the loop from the registers loaded in advance).
+ *
+ * Fixes vs. previous revision: added the statement semicolon after the
+ * in-loop advance load (it only compiled because DUP2_ARG2 expands to a
+ * brace block), and dropped the dead post-loop register copies and
+ * advance load, which also read rows below the processed block.
+ */
+static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src10, src_minus11, src11;
+    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
+    __m128i offset_mask0, offset_mask1;
+    __m128i zeros = {0};
+
+    /* pack the int16 SAO offset table down to bytes */
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
+              src_minus11);
+    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+              src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += src_stride_2x;
+
+        /* centre samples and down-right neighbours via byte shuffles */
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+                  shuf1, src_zero0, src_zero1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+                  src_plus10, src_plus11);
+
+        DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
+                  src_minus11, src_minus10, src_minus11);
+        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+                  src_zero0, src_zero1);
+        /* per-neighbour sign: -1 (centre smaller), 0 (equal), +1 (bigger) */
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        /* edge class = 2 + sign(first diff) + sign(second diff) */
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        /* bias to signed, saturating add, bias back to unsigned */
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        dst += dst_stride_2x;
+    }
+
+    /* last row pair, using the registers loaded in advance */
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+              src_zero0, src_zero1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+              src_plus10, src_plus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+              src_zero0, src_zero1);
+
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+              cmp_minus11, diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+              offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+/*
+ * HEVC SAO edge filter, 45-degree diagonal direction (EO class 2),
+ * width a multiple of 16, 8-bit samples.
+ *
+ * The block is processed in 4-row by 16-column tiles.  For each sample
+ * the two diagonal neighbours are compared with the centre sample; the
+ * sign pattern, mapped through edge_idx, selects the SAO offset that is
+ * saturating-added to the sample.
+ * NOTE(review): assumes height is a multiple of 4; the dispatcher
+ * passes width - (width & 0x0F) so width is a multiple of 16.
+ */
+static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *
+ sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig = src;
+ uint8_t *dst_orig = dst;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13, src_minus14, src_plus13;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+ __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+ __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
+ __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
+ __m128i src_zero3, sao_offset, src_plus12;
+
+ /* pack the int16 SAO offset table down to bytes */
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ /* src_orig points one column left of the tile */
+ src_orig = src - 1;
+ dst_orig = dst;
+ src_minus11 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src_minus12, src_minus13);
+ src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ /* neighbour row above the tile (column - 1 via src_orig) */
+ src_minus10 = __lsx_vld(src_orig - src_stride, 0);
+ src_orig += 16;
+ src10 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
+ src_stride_2x, src11, src12);
+ src13 = __lsx_vldx(src_orig, src_stride_3x);
+ /* neighbour row below tile row 3: row + 4; the byte offset 1
+ * supplies the +1 column shift (load is from src, not src_orig) */
+ src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
+
+ /* recentre the four rows and build their shifted neighbours */
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+ src_minus12, shuf1, src12, src_minus13, shuf1,
+ src13, src_minus14, shuf1, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
+ src_minus13, shuf2, src_plus10, src_plus11);
+ src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
+
+ /* -1/0/+1 sign of (centre - neighbour) for all four rows */
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10,
+ cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12,
+ cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ /* edge class = 2 + sign(minus diff) + sign(plus diff) */
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+
+ /* map the class through edge_idx, then through the offset table */
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ /* bias to signed, saturating add, bias back to unsigned */
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
+ 128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
+ src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ /* roll the right-hand loads into the left-hand slots for the
+ * next 16-column step */
+ src_minus11 = src10;
+ src_minus12 = src11;
+ src_minus13 = src12;
+ src_minus14 = src13;
+
+ /* write the four 16-byte rows of the tile */
+ __lsx_vst(dst0, dst_orig, 0);
+ __lsx_vstx(dst1, dst_orig, dst_stride);
+ __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+ __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+ dst_orig += 16;
+ }
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
+/*
+ * HEVC SAO edge filter, 135-degree diagonal direction (EO class 3),
+ * 4-pixel-wide column, 8-bit samples.
+ *
+ * Each sample is compared with its two 135-degree diagonal neighbours;
+ * the sign pattern, mapped through edge_idx, selects the SAO offset
+ * that is saturating-added to the sample.  Two rows are produced per
+ * iteration; height is expected to be even and >= 2 (the last row pair
+ * is handled after the loop from the registers loaded in advance).
+ *
+ * Fix vs. previous revision: removed the dead trailing
+ * "dst += dst_stride_2x;" after the final stores (the function returns
+ * immediately; the 45-degree 4-width variant has no such statement).
+ */
+static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i src_zero0, src_zero1, dst0;
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src10, src_minus11, src11;
+    __m128i offset_mask0, offset_mask1;
+    __m128i zeros = {0};
+
+    /* pack the int16 SAO offset table down to bytes */
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+              src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += src_stride_2x;
+
+        /* centre samples and shifted neighbour rows via byte shuffles */
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+                  shuf1, src_zero0, src_zero1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+                  shuf2, src_minus10, src_minus11);
+
+        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+                  src_minus10, src_minus11);
+        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+                  src_zero0, src_zero1);
+        /* per-neighbour sign: -1 (centre smaller), 0 (equal), +1 (bigger) */
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        /* edge class = 2 + sign sum, then map through the offset table */
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        /* bias to signed, saturating add, bias back to unsigned */
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+        dst += dst_stride_2x;
+    }
+
+    /* last row pair, using the registers loaded in advance */
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+              src_zero0, src_zero1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+              shuf2, src_minus10, src_minus11);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+              src_zero0, src_zero1);
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+              cmp_minus11, diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+              offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+/*
+ * HEVC SAO edge filter, 135-degree diagonal direction (EO class 3),
+ * 8-pixel-wide column, 8-bit samples.
+ *
+ * Each sample is compared with its two 135-degree diagonal neighbours;
+ * the sign pattern, mapped through edge_idx, selects the SAO offset
+ * that is saturating-added to the sample.  Two rows are produced per
+ * iteration; height is expected to be even and >= 2 (the last row pair
+ * is handled after the loop from the registers loaded in advance).
+ */
+static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ /* pack the int16 SAO offset table down to bytes */
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ /* centre samples and shifted neighbour rows via byte shuffles */
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ /* -1/0/+1 sign of (centre - neighbour) for both neighbours */
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ /* edge class = 2 + sign sum, then map through the offset table */
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ /* bias to signed, saturating add, bias back to unsigned */
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ /* last row pair, using the registers loaded in advance */
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+/*
+ * HEVC SAO edge filter, 135-degree diagonal direction (EO class 3),
+ * width a multiple of 16, 8-bit samples.
+ *
+ * The block is processed in 4-row by 16-column tiles.  For each sample
+ * the two 135-degree diagonal neighbours are compared with the centre
+ * sample; the sign pattern, mapped through edge_idx, selects the SAO
+ * offset that is saturating-added to the sample.
+ * NOTE(review): assumes height is a multiple of 4; the dispatcher
+ * passes width - (width & 0x0F) so width is a multiple of 16.
+ */
+static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig, *dst_orig;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+ __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+ __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+ __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+ __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+ __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+ /* pack the int16 SAO offset table down to bytes */
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ /* src_orig points one column left of the tile */
+ src_orig = src - 1;
+ dst_orig = dst;
+
+ src_minus11 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src_plus10, src_plus11);
+ src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ /* neighbour of tile row 0: row - 1; the byte offset 2 supplies
+ * the +1 column shift relative to src_orig (which is at -1) */
+ src_minus10 = __lsx_vld(src_orig - src_stride, 2);
+ /* neighbour row below tile row 3 */
+ src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
+ src_orig += 16;
+ src10 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src11, src12);
+ src13 =__lsx_vldx(src_orig, src_stride_3x);
+
+ /* recentre the four rows and build their shifted neighbours */
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+ src_plus10, shuf1, src12, src_plus11, shuf1, src13,
+ src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
+ src_zero3);
+ src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
+ DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
+ src_plus11, shuf2, src_minus12, src_minus13);
+
+ /* -1/0/+1 sign of (centre - neighbour) for all four rows */
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ /* edge class = 2 + sign(minus diff) + sign(plus diff) */
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+
+ /* map the class through edge_idx, then through the offset table */
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ /* bias to signed, saturating add, bias back to unsigned */
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+ src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ /* roll the right-hand loads into the left-hand slots for the
+ * next 16-column step */
+ src_minus11 = src10;
+ src_plus10 = src11;
+ src_plus11 = src12;
+ src_plus12 = src13;
+
+ /* write the four 16-byte rows of the tile */
+ __lsx_vst(dst0, dst_orig, 0);
+ __lsx_vstx(dst1, dst_orig, dst_stride);
+ __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+ __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+ dst_orig += 16;
+ }
+
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
+/*
+ * 8-bit SAO edge-offset filter entry point.
+ *
+ * eo selects the edge-offset class: 0 = horizontal (0 degrees),
+ * 1 = vertical (90 degrees), 2 = 45-degree diagonal, 3 = 135-degree
+ * diagonal.  The area is processed left to right: the widest
+ * multiple-of-16 part first, then an 8-wide column, then a trailing
+ * 4-wide column, each handled by the matching LSX helper.
+ */
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height)
+{
+    /* source rows live in the fixed-pitch SAO temporary buffer */
+    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
+    int wide = width & ~0x0F; /* multiple-of-16 portion of the width */
+
+    switch (eo) {
+    case 0: /* horizontal */
+        if (wide) {
+            hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
+                                                        src, stride_src,
+                                                        sao_offset_val,
+                                                        wide, height);
+            dst   += wide;
+            src   += wide;
+            width -= wide;
+        }
+        if (width >= 8) {
+            hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+            dst   += 8;
+            src   += 8;
+            width -= 8;
+        }
+        if (width) {
+            hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+        }
+        break;
+
+    case 1: /* vertical */
+        if (wide) {
+            hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         wide, height);
+            dst   += wide;
+            src   += wide;
+            width -= wide;
+        }
+        if (width >= 8) {
+            hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst   += 8;
+            src   += 8;
+            width -= 8;
+        }
+        if (width) {
+            hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 2: /* 45-degree diagonal */
+        if (wide) {
+            hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         wide, height);
+            dst   += wide;
+            src   += wide;
+            width -= wide;
+        }
+        if (width >= 8) {
+            hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst   += 8;
+            src   += 8;
+            width -= 8;
+        }
+        if (width) {
+            hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 3: /* 135-degree diagonal */
+        if (wide) {
+            hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
+                                                          src, stride_src,
+                                                          sao_offset_val,
+                                                          wide, height);
+            dst   += wide;
+            src   += wide;
+            width -= wide;
+        }
+        if (width >= 8) {
+            hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+            dst   += 8;
+            src   += 8;
+            width -= 8;
+        }
+        if (width) {
+            hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+        }
+        break;
+    }
+}
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index fc0e8fb0df..f39674be64 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -85,6 +85,25 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
+
+ c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
+ c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_lsx;
+
+ c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_lsx;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_lsx;
+
+ c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_lsx;
+ c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_lsx;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+ c->idct[0] = ff_hevc_idct_4x4_lsx;
+ c->idct[1] = ff_hevc_idct_8x8_lsx;
+ c->idct[2] = ff_hevc_idct_16x16_lsx;
+ c->idct[3] = ff_hevc_idct_32x32_lsx;
}
}
}
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3259e03f13..0e73fd1f8e 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,4 +85,30 @@ MC(epel, hv, 32);
#undef MC
+/* HEVC deblocking (loop) filters, 8-bit: luma and chroma,
+ * horizontal (h) and vertical (v) edges. */
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm);
+
+/* SAO edge-offset filter entry point, 8-bit; eo selects the
+ * edge-offset class (0 = horizontal, 1 = vertical, 2 = 45 degrees,
+ * 3 = 135 degrees). */
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride_dst,
+ int16_t *sao_offset_val,
+ int eo, int width, int height);
+
+/* Inverse transforms, one per transform size. */
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
More information about the ffmpeg-cvslog
mailing list