[FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC
daichengrong at iscas.ac.cn
Tue May 20 10:58:06 EEST 2025
From: daichengrong <daichengrong at iscas.ac.cn>
Since there were no comments on v2 and v3, we have continued to optimize according to the comments on v1.
We spill values to memory instead of sliding them within vector registers, which helps performance, and we have optimized the extraction of elements from vector registers.
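For illustration, this is roughly what changed (a minimal sketch with illustrative register names, not the exact sequence from the patch; the patch reloads wider words with ld, lh is used here for clarity):

    # before: extract each lane with a slide plus a scalar move
    vsetivli      zero, 4, e16, m1, ta, ma
    vslidedown.vi v8, v0, 1        # move lane 1 down to position 0
    vmv.x.s       t1, v8           # read it into a general-purpose register

    # after: spill the vector once, then load every lane from memory
    vse16.v       v0, (s0)         # s0 points to a small scratch buffer
    lh            t1, 1*2(s0)      # lane 1 (2-byte elements)
    lh            t2, 2*2(s0)      # lane 2, and so on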
On Banana Pi F3:
hevc_idct_32x32_8_c: 119920.0 ( 1.00x)
hevc_idct_32x32_8_rvv_i64: 20247.3 ( 5.92x) (V4)
hevc_idct_32x32_8_rvv_i64: 28718.3 ( 4.14x) (V3)
hevc_idct_32x32_8_rvv_i64: 28503.7 ( 4.17x) (V2)
hevc_idct_32x32_8_rvv_i64: 51254.4 ( 2.33x) (V1)
Changes in v4:
Removed unnecessary slide operations
Extracted more scalars from vector registers into general-purpose registers
Changes in v3:
Removed the slides in the transposition and spilled values from vector registers to the stack (see the sketch after this list)
Changes in v2:
Deleted tabs
Removed the unnecessary t0 in vsetivli
Extracted scalars directly into general-purpose registers
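The transposition change referenced above now goes through one indexed gather in a scratch buffer instead of a chain of slides; condensed from the transpose16_4x4_2 macro added below (s1 is the scratch pointer and trans_index the offset table from the patch):

    # rows have already been stored contiguously at (s1)
    li         t0, 32
    vsetvli    zero, t0, e16, m4, ta, ma
    lla        t0, trans_index     # byte offsets of the transposed layout
    vle16.v    v24, (t0)
    vluxei16.v v24, (s1), v24      # a single indexed load reorders the elements
    vse16.v    v24, (s1)           # transposed rows are then reloaded row by row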
---
libavcodec/riscv/Makefile | 1 +
libavcodec/riscv/hevcdsp_idct_rvv.S | 957 ++++++++++++++++++++++++++++
libavcodec/riscv/hevcdsp_init.c | 52 +-
3 files changed, 990 insertions(+), 20 deletions(-)
create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..586c97bdf9
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,957 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+ .2byte 64, 83, 64, 36
+ .2byte 89, 75, 50, 18
+ .2byte 90, 87, 80, 70
+ .2byte 57, 43, 25, 9
+ .2byte 90, 90, 88, 85
+ .2byte 82, 78, 73, 67
+ .2byte 61, 54, 46, 38
+ .2byte 31, 22, 13, 4
+endconst
+
+const trans_index, align=4
+ .2byte 0, 16, 32, 48, 62, 46, 30, 14
+ .2byte 2, 18, 34, 50, 60, 44, 28, 12
+ .2byte 4, 20, 36, 52, 58, 42, 26, 10
+ .2byte 6, 22, 38, 54, 56, 40, 24, 8
+endconst
+
+.macro sum_sub out, in, c, op, p
+ mv t0, \c
+ .ifc \op, -
+ neg t0, t0
+ .endif
+ vsetivli zero, 4, e16, mf2, ta, ma
+ .ifc \p, 2
+ vslidedown.vi v8, \in, 4
+ vwmacc.vx \out, t0, v8
+ .else
+ vwmacc.vx \out, t0, \in
+ .endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+ .ifc \op0, -
+ neg \t0, \t0
+ .endif
+ .ifc \op1, -
+ neg \t1, \t1
+ .endif
+ .ifc \op2, -
+ neg \t2, \t2
+ .endif
+ .ifc \op3, -
+ neg \t3, \t3
+ .endif
+
+.ifc \p, 2
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vslidedown.vi v8, \in, 4
+
+ vwmacc.vx v24, \t0, v8
+ vwmacc.vx v25, \t1, v8
+ vwmacc.vx v26, \t2, v8
+ vwmacc.vx v27, \t3, v8
+
+.else
+
+ vwmacc.vx v24, \t0, \in
+ vwmacc.vx v25, \t1, \in
+ vwmacc.vx v26, \t2, \in
+ vwmacc.vx v27, \t3, \in
+.endif
+
+ .ifc \op0, -
+ neg \t0, \t0
+ .endif
+ .ifc \op1, -
+ neg \t1, \t1
+ .endif
+ .ifc \op2, -
+ neg \t2, \t2
+ .endif
+ .ifc \op3, -
+ neg \t3, \t3
+ .endif
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+ vsetivli zero, 4, e32, m1, ta, ma
+ vadd.vv \tmp_p, \e, \o
+ vsub.vv \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+ vsetivli zero, 4, e32, m1, ta, ma
+ vadd.vv v20, \in0, \in1
+ vsub.vv \in0, \in0, \in1
+ vadd.vv \in1, \in2, \in3
+ vsub.vv \in2, \in2, \in3
+ vadd.vv \in3, \in4, \in5
+ vsub.vv \in4, \in4, \in5
+ vadd.vv \in5, \in6, \in7
+ vsub.vv \in6, \in6, \in7
+.endm
+
+.macro multiply in
+ vsetivli zero, 4, e16, m1, ta, ma
+ vse16.v \in, (s0)
+ ld s2, 0*2(s0)
+ ld s3, 1*2(s0)
+ ld s4, 2*2(s0)
+ ld s5, 3*2(s0)
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vwmul.vx v24, v4, s2
+ vwmul.vx v25, v4, s3
+ vwmul.vx v26, v4, s4
+ vwmul.vx v27, v4, s5
+.endm
+
+func tr_block1, zve64x
+ multiply v0
+
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, ta, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x11, x14, x17, x20, +, +, +, +, 2
+ add_member32 v5, x12, x17, x22, x24, +, +, +, -
+ add_member32 v5, x13, x20, x24, x17, +, +, -, -, 2
+ add_member32 v6, x14, x23, x19, x10, +, +, -, -
+ add_member32 v6, x15, x25, x14, x16, +, -, -, -, 2
+ add_member32 v7, x16, x22, x10, x23, +, -, -, -
+ add_member32 v7, x17, x19, x15, x21, +, -, -, +, 2
+ add_member32 v16, x18, x16, x20, x14, +, -, -, +
+ add_member32 v16, x19, x13, x25, x12, +, -, -, +, 2
+ add_member32 v17, x20, x11, x21, x19, +, -, +, +
+ add_member32 v17, x21, x12, x16, x25, +, -, +, -, 2
+ add_member32 v18, x22, x15, x11, x18, +, -, +, -
+ add_member32 v18, x23, x18, x13, x11, +, -, +, -, 2
+ add_member32 v19, x24, x21, x18, x15, +, -, +, -
+ add_member32 v19, x25, x24, x23, x22, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+func tr_block2, zve64x
+ multiply v1
+
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, ta, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x23, x25, x22, x19, +, -, -, -, 2
+ add_member32 v5, x19, x14, x10, x15, -, -, -, -
+ add_member32 v5, x10, x16, x23, x21, -, -, -, +, 2
+ add_member32 v6, x18, x24, x15, x13, -, +, +, +
+ add_member32 v6, x24, x13, x17, x23, +, +, +, -, 2
+ add_member32 v7, x15, x17, x21, x10, +, +, -, -
+ add_member32 v7, x13, x23, x11, x25, +, -, -, +, 2
+ add_member32 v16, x22, x12, x24, x11, +, -, -, +
+ add_member32 v16, x20, x18, x14, x24, -, -, +, +, 2
+ add_member32 v17, x11, x22, x18, x12, -, +, +, -
+ add_member32 v17, x17, x11, x20, x22, -, +, -, -, 2
+ add_member32 v18, x25, x19, x12, x14, +, +, -, +
+ add_member32 v18, x16, x21, x25, x20, +, -, -, +, 2
+ add_member32 v19, x12, x11, x13, x16, +, -, +, -
+ add_member32 v19, x21, x20, x19, x18, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+func tr_block3, zve64x
+ multiply v2
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, ta, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x16, x13, x10, x12, -, -, -, -, 2
+ add_member32 v5, x20, x25, x21, x16, -, -, +, +
+ add_member32 v5, x14, x12, x19, x25, +, +, +, -, 2
+ add_member32 v6, x22, x20, x11, x17, +, -, -, -
+ add_member32 v6, x12, x18, x22, x10, -, -, +, +, 2
+ add_member32 v7, x24, x14, x18, x20, -, +, +, -
+ add_member32 v7, x10, x24, x12, x22, +, +, -, -, 2
+ add_member32 v16, x25, x11, x23, x13, -, -, +, +
+ add_member32 v16, x11, x21, x17, x15, -, +, +, -, 2
+ add_member32 v17, x23, x17, x13, x24, +, +, -, +
+ add_member32 v17, x13, x15, x24, x18, +, -, +, +, 2
+ add_member32 v18, x21, x23, x16, x11, -, -, +, -
+ add_member32 v18, x15, x10, x14, x19, -, +, -, +, 2
+ add_member32 v19, x19, x22, x25, x23, +, -, +, +
+ add_member32 v19, x17, x16, x15, x14, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+func tr_block4, zve64x
+ multiply v3
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, ta, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x15, x18, x21, x24, -, -, -, -, 2
+ add_member32 v5, x10, x13, x18, x23, +, +, +, +
+ add_member32 v5, x18, x10, x15, x22, -, -, -, -, 2
+ add_member32 v6, x25, x16, x12, x21, +, +, +, +
+ add_member32 v6, x19, x21, x10, x20, +, -, -, -, 2
+ add_member32 v7, x12, x25, x13, x19, -, -, +, +
+ add_member32 v7, x14, x20, x16, x18, +, +, -, -, 2
+ add_member32 v16, x21, x15, x19, x17, -, -, +, +
+ add_member32 v16, x23, x11, x22, x16, -, +, -, -, 2
+ add_member32 v17, x16, x14, x25, x15, +, -, +, +
+ add_member32 v17, x11, x19, x23, x14, -, +, +, -, 2
+ add_member32 v18, x17, x24, x20, x13, +, -, -, +
+ add_member32 v18, x24, x22, x17, x12, -, -, +, -, 2
+ add_member32 v19, x20, x17, x14, x11, -, +, -, +
+ add_member32 v19, x13, x12, x11, x10, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+.macro butterfly32 in0, in1, in2, in3, out
+ vsetivli zero, 4, e32, m1, ta, ma
+ vadd.vv \out, \in0, \in1
+ vsub.vv \in0, \in0, \in1
+ vadd.vv \in1, \in2, \in3
+ vsub.vv \in2, \in2, \in3
+.endm
+
+.macro load16 in0, in1, in2, in3
+ sub t0, a3, a1
+
+ vsetivli zero, 2, e64, m1, ta, ma
+ vlse64.v \in0, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v \in1, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v \in2, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v \in3, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+.endm
+
+.macro store16 in0, in1, in2, in3, rx
+ sub t0, a3, a1
+ vsetivli zero, 2, e64, m1, ta, ma
+ vsse64.v \in0, (a1), t0
+ add a1, a1, a2
+ add a3, a3, \rx
+
+ sub t0, a3, a1
+ vsse64.v \in1, (a1), t0
+ add a1, a1, a2
+ add a3, a3, \rx
+
+ sub t0, a3, a1
+ vsse64.v \in2, (a1), t0
+ add a1, a1, a2
+ add a3, a3, \rx
+
+ sub t0, a3, a1
+ vsse64.v \in3, (a1), t0
+ add a1, a1, a2
+ add a3, a3, \rx
+.endm
+
+.macro load32
+ addi a1, a5, 64
+ addi a3, a1, 128
+ li a2, 256
+
+ sub t0, a3, a1
+ vsetivli zero, 2, e64, m1, ta, ma
+ vlse64.v v4, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v5, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v6, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v7, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v16, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v17, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v18, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vlse64.v v19, (a1), t0
+ add a1, a1, a2
+ add a3, a3, a2
+
+.endm
+
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7, op0, op1, op2, op3, op4, op5, op6, op7, p
+ .ifc \op0, -
+ neg \tt0, \tt0
+ .endif
+ .ifc \op1, -
+ neg \tt1, \tt1
+ .endif
+ .ifc \op2, -
+ neg \tt2, \tt2
+ .endif
+ .ifc \op3, -
+ neg \tt3, \tt3
+ .endif
+ .ifc \op4, -
+ neg \tt4, \tt4
+ .endif
+ .ifc \op5, -
+ neg \tt5, \tt5
+ .endif
+ .ifc \op6, -
+ neg \tt6, \tt6
+ .endif
+ .ifc \op7, -
+ neg \tt7, \tt7
+ .endif
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ .ifc \p, 2
+ vslidedown.vi v8, \in, 4
+ .else
+ vmv.v.v v8, \in
+ .endif
+
+ vwmacc.vx v21, \tt0, v8
+ vwmacc.vx v22, \tt1, v8
+ vwmacc.vx v23, \tt2, v8
+ vwmacc.vx v24, \tt3, v8
+ vwmacc.vx v25, \tt4, v8
+ vwmacc.vx v26, \tt5, v8
+ vwmacc.vx v27, \tt6, v8
+ vwmacc.vx v28, \tt7, v8
+
+ .ifc \op0, -
+ neg \tt0, \tt0
+ .endif
+ .ifc \op1, -
+ neg \tt1, \tt1
+ .endif
+ .ifc \op2, -
+ neg \tt2, \tt2
+ .endif
+ .ifc \op3, -
+ neg \tt3, \tt3
+ .endif
+ .ifc \op4, -
+ neg \tt4, \tt4
+ .endif
+ .ifc \op5, -
+ neg \tt5, \tt5
+ .endif
+ .ifc \op6, -
+ neg \tt6, \tt6
+ .endif
+ .ifc \op7, -
+ neg \tt7, \tt7
+ .endif
+.endm
+
+.macro scale_store shift
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v28, (a4)
+ addi a4, a4, 2*8
+ vle16.v v29, (a4)
+ addi a4, a4, 2*8
+ vle16.v v30, (a4)
+ addi a4, a4, 2*8
+ vle16.v v31, (a4)
+ addi a4, a4, 2*8
+
+ butterfly32 v28, v24, v29, v25, v2
+ butterfly32 v30, v26, v31, v27, v3
+
+ scale v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+ transpose16_4x4_2 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+
+ store16 v20, v21, v22, v23, t1
+
+ vsetivli zero, 4, e16, m1, ta, ma
+ vle16.v v2, (t2)
+ addi t2, t2, 8
+ vle16.v v3, (t2)
+ addi t2, t2, -8
+.endm
+
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+ li a7, \off1
+ add a1, sp, a7
+ li a7, \off2
+ add a3, sp, a7
+ li a2, -16
+ li a4, 16
+
+ vsetivli zero, 4, e32, m1, ta, ma
+ vse32.v \in0, (a1)
+ add a1, a1, a4
+ vse32.v \in1, (a3)
+ add a3, a3, a2
+ vse32.v \in2, (a1)
+ add a1, a1, a4
+ vse32.v \in3, (a3)
+ add a3, a3, a2
+ vse32.v \in4, (a1)
+ add a1, a1, a4
+ vse32.v \in5, (a3)
+ add a3, a3, a2
+ vse32.v \in6, (a1)
+ vse32.v \in7, (a3)
+.endm
+
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+ vsetivli zero, 8, e16, m1, ta, ma
+ vse16.v v\r0, (s1)
+ add t0, s1, 16
+ vse16.v v\r1, (t0)
+ add t0, t0, 16
+ vse16.v v\r2, (t0)
+ add t0, t0, 16
+ vse16.v v\r3, (t0)
+
+ li t0, 32
+ vsetvli zero, t0, e16, m4, ta, ma
+ lla t0, trans_index
+ vle16.v v\tmp0, (t0)
+ vluxei16.v v\tmp0, (s1), v\tmp0
+ vse16.v v\tmp0, (s1)
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v\r0, (s1)
+ add t0, s1, 16
+ vle16.v v\r1, (t0)
+ add t0, t0, 16
+ vle16.v v\r2, (t0)
+ add t0, t0, 16
+ vle16.v v\r3, (t0)
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, offset
+ tr_4x4_8 \in0, \in1, \in2, \in3, v24, v25, v26, v27
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vse16.v v0, (s0)
+ lh s6, 4*2(s0)
+ lh s7, 5*2(s0)
+ lh s8, 6*2(s0)
+ lh s9, 7*2(s0)
+
+ neg s2, s6
+ neg s4, s8
+ neg s5, s9
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vslidedown.vi v8, \in0, 4
+ vwmul.vx v28, v8, s6
+ vwmul.vx v29, v8, s7
+ vwmul.vx v30, v8, s8
+ vwmul.vx v31, v8, s9
+
+ vslidedown.vi v8, \in1, 4
+ vwmacc.vx v28, s7, v8
+ vwmacc.vx v29, s5, v8
+ vwmacc.vx v30, s2, v8
+ vwmacc.vx v31, s4, v8
+
+
+ vslidedown.vi v8, \in2, 4
+ vwmacc.vx v28, s8, v8
+ vwmacc.vx v29, s2, v8
+ vwmacc.vx v30, s9, v8
+ vwmacc.vx v31, s7, v8
+
+ vslidedown.vi v8, \in3, 4
+ vwmacc.vx v28, s9, v8
+ vwmacc.vx v29, s4, v8
+ vwmacc.vx v30, s7, v8
+ vwmacc.vx v31, s2, v8
+
+
+ butterfly v24, v28, v16, v23
+ butterfly v25, v29, v17, v22
+ butterfly v26, v30, v18, v21
+ butterfly v27, v31, v19, v20
+
+ li a7, \offset
+ add a4, sp, a7
+
+ vsetivli zero, 4, e32, m1, ta, ma
+ vse32.v v16, (a4)
+ add a4, a4, 16
+ vse32.v v17, (a4)
+ add a4, a4, 16
+ vse32.v v18, (a4)
+ add a4, a4, 16
+ vse32.v v19, (a4)
+ add a4, a4, 16
+
+ vse32.v v20, (a4)
+ add a4, a4, 16
+ vse32.v v21, (a4)
+ add a4, a4, 16
+ vse32.v v22, (a4)
+ add a4, a4, 16
+ vse32.v v23, (a4)
+ add a4, a4, 16
+
+ add a4, a4, -64
+.endm
+
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi v8, \in1\(), \shift
+ vsetivli zero, 2, e64, m1, ta, ma
+ vslideup.vi \out0\(), v8, 1
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi \out0\(), \in0\(), \shift
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi v8, \in3\(), \shift
+ vsetivli zero, 2, e64, m1, ta, ma
+ vslideup.vi \out1\(), v8, 1
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi \out1\(), \in2\(), \shift
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi v8, \in5\(), \shift
+ vsetivli zero, 2, e64, m1, ta, ma
+ vslideup.vi \out2\(), v8, 1
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi \out2\(), \in4\(), \shift
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi v8, \in7\(), \shift
+ vsetivli zero, 2, e64, m1, ta, ma
+ vslideup.vi \out3\(), v8, 1
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vnclip.wi \out3\(), \in6\(), \shift
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+ vsetivli zero, 4, e16, m1, ta, ma
+ vwcvt.x.x.v v8, \in0
+ vsetivli zero, 4, e32, m1, ta, ma
+ vsll.vi v28, v8, 6
+
+ vsetivli zero, 16, e8, m1, ta, ma
+ vmv.v.v v29, v28
+
+ vsetivli zero, 4, e16, m1, ta, ma
+ vse16.v v0, (s0)
+ lh s2, 0*2(s0)
+ lh s3, 1*2(s0)
+ lh s5, 3*2(s0)
+
+ neg s6, s2
+ neg s7, s3
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vwmul.vx v30, \in1, s3
+ vwmul.vx v31, \in1, s5
+ vwmacc.vx v28, s2, \in2
+
+ vwmacc.vx v29, s6, \in2
+ vwmacc.vx v30, s5, \in3
+ vwmacc.vx v31, s7, \in3
+
+ vsetivli zero, 4, e32, m1, ta, ma
+ vadd.vv \out0, v28, v30
+ vadd.vv \out1, v29, v31
+ vsub.vv \out2, v29, v31
+ vsub.vv \out3, v28, v30
+.endm
+
+.macro tr_16x4 name, shift, offset, step
+func func_tr_16x4_\name, zve64x
+ mv a1, a5
+ addi a3, a5, \step * 64
+ li a2, \step * 128
+ load16 v16, v17, v18, v19
+
+ lla a1, trans
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a1)
+
+ tr16_8x4 v16, v17, v18, v19, \offset
+
+ addi a1, a5, \step * 32
+ addi a3, a5, \step * 3 *32
+ li a2, \step * 128
+ load16 v20, v17, v18, v19
+
+ lla a1, trans
+ addi a1, a1, 16
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v1, (a1)
+
+ lh s2, 0*2(a1)
+ lh s3, 1*2(a1)
+ lh s4, 2*2(a1)
+ lh s5, 3*2(a1)
+ lh s6, 4*2(a1)
+ lh s7, 5*2(a1)
+ lh s8, 6*2(a1)
+ lh s9, 7*2(a1)
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vwmul.vx v21, v20, s2
+ vwmul.vx v22, v20, s3
+ vwmul.vx v23, v20, s4
+ vwmul.vx v24, v20, s5
+ vwmul.vx v25, v20, s6
+ vwmul.vx v26, v20, s7
+ vwmul.vx v27, v20, s8
+ vwmul.vx v28, v20, s9
+
+ add_member v20, s3, s6, s9, s7, s4, s2, s5, s8, +, +, +, -, -, -, -, -, 2
+ add_member v17, s4, s9, s5, s3, s8, s6, s2, s7, +, +, -, -, -, +, +, +
+ add_member v17, s5, s7, s3, s9, s2, s8, s4, s6, +, -, -, +, +, +, -, -, 2
+ add_member v18, s6, s4, s8, s2, s9, s3, s7, s5, +, -, -, +, -, -, +, +
+ add_member v18, s7, s2, s6, s8, s3, s5, s9, s4, +, -, +, +, -, +, +, -, 2
+ add_member v19, s8, s5, s2, s4, s7, s9, s6, s3, +, -, +, -, +, +, -, +
+ add_member v19, s9, s8, s7, s6, s5, s4, s3, s2, +, -, +, -, +, -, +, -, 2
+
+ li a7, \offset
+ add a4, sp, a7
+
+ vsetivli zero, 4, e32, m1, ta, ma
+ vle32.v v16, (a4)
+ addi a4, a4, 16
+ vle32.v v17, (a4)
+ addi a4, a4, 16
+ vle32.v v18, (a4)
+ addi a4, a4, 16
+ vle32.v v19, (a4)
+ addi a4, a4, 16
+
+ butterfly16 v16, v21, v17, v22, v18, v23, v19, v24
+ .if \shift > 0
+ scale v29, v30, v31, v24, v20, v16, v21, v17, v22, v18, v23, v19, \shift
+
+ transpose16_4x4_2 29, 30, 31, 24, 4, 5, 6, 7, 2, 3
+
+ mv a1, a6
+ addi a3, a6, 24 +3*32
+ li a2, 32
+ li a4, -32
+
+ store16 v29, v30, v31, v24, a4
+ .else
+ store_to_stack \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+ .endif
+
+ li a7, \offset+64
+ add a4, sp, a7
+
+ vsetivli zero, 4, e32, m1, ta, ma
+ vle32.v v16, (a4)
+ addi a4, a4, 16
+ vle32.v v17, (a4)
+ addi a4, a4, 16
+ vle32.v v18, (a4)
+ addi a4, a4, 16
+ vle32.v v19, (a4)
+ addi a4, a4, 16
+
+ butterfly16 v16, v25, v17, v26, v18, v27, v19, v28
+ .if \shift > 0
+ scale v29, v30, v31, v20, v20, v16, v25, v17, v26, v18, v27, v19, \shift
+
+ transpose16_4x4_2 29, 30, 31, 20, 4, 5, 6, 7, 2, 3
+
+ add a1, a6, 8
+ add a3, a6, (16 + 3 * 32)
+ li a2, 32
+ li a4, -32
+ store16 v29, v30, v31, v20, a4
+ .else
+ store_to_stack (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+ .endif
+ ret
+endfunc
+.endm
+
+tr_16x4 noscale, 0, 2048, 4
+
+.macro tr_32x4 name, shift
+func func_tr_32x4_\name, zve64x
+ mv t3, ra
+ jal func_tr_16x4_noscale
+
+ load32
+
+ lla t2, trans
+ addi t2, t2, 32
+
+ vsetivli zero, 4, e16, m1, ta, ma
+ vle16.v v0, (t2)
+ addi t2, t2, 2*4
+ vle16.v v1, (t2)
+ addi t2, t2, 2*4
+ vle16.v v2, (t2)
+ addi t2, t2, 2*4
+ vle16.v v3, (t2)
+ addi t2, t2, -2*4
+
+ li a7, 2048
+ add a4, sp, a7
+
+ li a2, 64
+ li t1, -64
+
+ jal tr_block1
+ mv a1, t4
+ addi a3, t4, (56 + 3 * 64)
+ scale_store \shift
+
+ jal tr_block2
+ addi a1, t4, 8
+ addi a3, t4, (48 + 3 * 64)
+ scale_store \shift
+
+ jal tr_block3
+ addi a1, t4, 16
+ addi a3, t4, (40 + 3 * 64)
+ scale_store \shift
+
+ jal tr_block4
+ addi a1, t4, 24
+ addi a3, t4, (32 + 3 * 64)
+ scale_store \shift
+
+ jr t3
+endfunc
+.endm
+
+tr_32x4 firstpass, 7
+tr_32x4 secondpass_8, 20 - 8
+
+.macro idct_32x32 bitdepth
+func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
+ mv t6, ra
+ addi sp, sp, -8*13
+ sd ra, 8*12(sp)
+ sd s0, 8*11(sp)
+ sd s1, 8*10(sp)
+ sd s2, 8*9(sp)
+ sd s3, 8*8(sp)
+ sd s4, 8*7(sp)
+ sd s5, 8*6(sp)
+ sd s6, 8*5(sp)
+ sd s7, 8*4(sp)
+ sd s8, 8*3(sp)
+ sd s9, 8*2(sp)
+ sd s10, 8*1(sp)
+ sd s11, 8*0(sp)
+
+ add sp, sp, -16
+ mv s0, sp
+
+ add sp, sp, -64
+ mv s1, sp
+
+ csrwi vxrm, 1
+ li a7, 2432
+ sub sp, sp, a7
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ li a7, 8 * \i
+ add a5, a0, a7
+
+ li a7, 8 * \i * 32
+ add t4, sp, a7
+ jal func_tr_32x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ addi a5, sp, 8 * \i
+ addi t4, a0, 8 * \i * 32
+ jal func_tr_32x4_secondpass_\bitdepth
+.endr
+
+ li a7, 2432
+ add sp, sp, a7
+
+ add sp, sp, 80
+
+ ld ra, 8*12(sp)
+ ld s0, 8*11(sp)
+ ld s1, 8*10(sp)
+ ld s2, 8*9(sp)
+ ld s3, 8*8(sp)
+ ld s4, 8*7(sp)
+ ld s5, 8*6(sp)
+ ld s6, 8*5(sp)
+ ld s7, 8*4(sp)
+ ld s8, 8*3(sp)
+ ld s9, 8*2(sp)
+ ld s10, 8*1(sp)
+ ld s11, 8*0(sp)
+ addi sp, sp, 8*13
+
+ jr t6
+endfunc
+.endm
+
+idct_32x32 8
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..6dfb889eec 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,8 @@
#include "libavcodec/hevc/dsp.h"
#include "libavcodec/riscv/h26x/h2656dsp.h"
+void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
+
#define RVV_FNASSIGN(member, v, h, fn, ext) \
member[1][v][h] = ff_h2656_put_pixels_##8_##ext; \
member[3][v][h] = ff_h2656_put_pixels_##8_##ext; \
@@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
const int flags = av_get_cpu_flags();
int vlenb;
- if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
- return;
-
vlenb = ff_get_rv_vlenb();
- if (vlenb >= 32) {
- switch (bit_depth) {
- case 8:
- RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
- RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
- break;
- default:
- break;
- }
- } else if (vlenb >= 16) {
- switch (bit_depth) {
- case 8:
- RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
- RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
- break;
- default:
- break;
+
+ if (flags & AV_CPU_FLAG_RVV_I64)
+ if (vlenb >= 16)
+ switch (bit_depth) {
+ case 8:
+ c->idct[3] = ff_hevc_idct_32x32_8_rvv;
+ break;
+ default:
+ break;
+ }
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) {
+ if (vlenb >= 32) {
+ switch (bit_depth) {
+ case 8:
+ RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+ RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+ break;
+ default:
+ break;
+ }
+ } else if (vlenb >= 16) {
+ switch (bit_depth) {
+ case 8:
+ RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+ RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+ break;
+ default:
+ break;
+ }
}
}
#endif
--
2.25.1