[FFmpeg-cvslog] lavc/h263dsp: R-V V {h,v}_loop_filter
Rémi Denis-Courmont
git at videolan.org
Wed May 22 19:33:10 EEST 2024
ffmpeg | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Sun May 19 10:03:29 2024 +0300| [910d281b215720dc831486aba36e31528b30b6bc] | committer: Rémi Denis-Courmont
lavc/h263dsp: R-V V {h,v}_loop_filter
Since the horizontal and vertical filters are identical except for a
transposition, this uses a common subprocedure with an ad-hoc ABI.
To preserve return-address stack prediction, a link register has to be
used (cf. the "Control Transfer Instructions" section of the
RISC-V ISA Manual). The alternate/temporary link register T0 is used
here, so that the normal RA is preserved (something Arm cannot do!).
To load the strength value based on `qscale`, the shortest possible
and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic
LLA; ADD; LBU sequence would add one more instruction since LLA is a
convenience alias for AUIPC; ADDI. To ensure that this trick works,
relocation relaxation is disabled.
To implement the two signed divisions by a power of two toward zero:
(x / (1 << SHIFT))
the code relies on the small range of integers involved, computing:
(x + (x >> (16 - SHIFT))) >> SHIFT
rather than the more general:
(x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT
Thus one ANDI instruction is avoided.
T-Head C908:
h263dsp.h_loop_filter_c: 228.2
h263dsp.h_loop_filter_rvv_i32: 144.0
h263dsp.v_loop_filter_c: 242.7
h263dsp.v_loop_filter_rvv_i32: 114.0
(C is probably worse in real use due to less predictable branches.)
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=910d281b215720dc831486aba36e31528b30b6bc
---
libavcodec/h263dsp.c | 4 +-
libavcodec/h263dsp.h | 1 +
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/h263dsp_init.c | 41 ++++++++++++++++
libavcodec/riscv/h263dsp_rvv.S | 100 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 147 insertions(+), 1 deletion(-)
diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
index 8fa2d3c297..6a13353499 100644
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@@ -119,7 +119,9 @@ av_cold void ff_h263dsp_init(H263DSPContext *ctx)
ctx->h263_h_loop_filter = h263_h_loop_filter_c;
ctx->h263_v_loop_filter = h263_v_loop_filter_c;
-#if ARCH_X86
+#if ARCH_RISCV
+ ff_h263dsp_init_riscv(ctx);
+#elif ARCH_X86
ff_h263dsp_init_x86(ctx);
#elif ARCH_MIPS
ff_h263dsp_init_mips(ctx);
diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
index 1abea3ca8c..2dccd23392 100644
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@@ -29,6 +29,7 @@ typedef struct H263DSPContext {
} H263DSPContext;
void ff_h263dsp_init(H263DSPContext *ctx);
+void ff_h263dsp_init_riscv(H263DSPContext *ctx);
void ff_h263dsp_init_x86(H263DSPContext *ctx);
void ff_h263dsp_init_mips(H263DSPContext *ctx);
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 67e198d754..e608436aa4 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -26,6 +26,8 @@ OBJS-$(CONFIG_G722DSP) += riscv/g722dsp_init.o
RVV-OBJS-$(CONFIG_G722DSP) += riscv/g722dsp_rvv.o
OBJS-$(CONFIG_JPEG2000_DECODER) += riscv/jpeg2000dsp_init.o
RVV-OBJS-$(CONFIG_JPEG2000_DECODER) += riscv/jpeg2000dsp_rvv.o
+OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_init.o
+RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
diff --git a/libavcodec/riscv/h263dsp_init.c b/libavcodec/riscv/h263dsp_init.c
new file mode 100644
index 0000000000..21b536366c
--- /dev/null
+++ b/libavcodec/riscv/h263dsp_init.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/h263dsp.h"
+
+void ff_h263_h_loop_filter_rvv(uint8_t *src, int stride, int q);
+void ff_h263_v_loop_filter_rvv(uint8_t *src, int stride, int q);
+
+/**
+ * Install the RISC-V implementations of the H.263 loop filters.
+ *
+ * The two function pointers in @p c are replaced only when the build has
+ * RVV support (HAVE_RVV), the run-time CPU flags include
+ * AV_CPU_FLAG_RVV_I32 and ff_rv_vlen_least(128) is true. Otherwise @p c is
+ * left untouched.
+ *
+ * @param c DSP context whose h263_h_loop_filter and h263_v_loop_filter
+ *          pointers may be overridden
+ */
+av_cold void ff_h263dsp_init_riscv(H263DSPContext *c)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ /* NOTE(review): the assembly processes 8 elements per vector at e8/mf2
+ * and e16/m1, which is presumably why at least 128-bit vectors are
+ * required here - confirm against ff_rv_vlen_least(). */
+ if ((flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+ c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv;
+ c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv;
+ }
+#endif
+}
diff --git a/libavcodec/riscv/h263dsp_rvv.S b/libavcodec/riscv/h263dsp_rvv.S
new file mode 100644
index 0000000000..97503d527c
--- /dev/null
+++ b/libavcodec/riscv/h263dsp_rvv.S
@@ -0,0 +1,100 @@
+/*
+ * Copyright © 2024 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+ /* Relaxation is disabled so that the AUIPC/ADD/LBU sequence used below to
+ * index ff_h263_loop_filter_strength is emitted exactly as written. */
+ .option push
+ .option norelax
+/*
+ * void ff_h263_h_loop_filter_rvv(uint8_t *src, int stride, int q)
+ *
+ * a0 = src, a1 = stride, a2 = q (index into ff_h263_loop_filter_strength).
+ * Loads 8 segments of 4 bytes each, starting at src - 2 and spaced `stride`
+ * bytes apart, so that v8..v11 hold p0..p3. It then runs the shared filter
+ * core at label 1 and stores the 4 filtered columns back to the same place.
+ */
+func ff_h263_h_loop_filter_rvv, zve32x
+ addi a0, a0, -2
+ vsetivli zero, 8, e8, mf2, ta, ma
+ vlsseg4e8.v v8, (a0), a1
+ /* Call the core with t0 (not ra) as the link register; ra is preserved. */
+ jal t0, 1f
+ vssseg4e8.v v8, (a0), a1
+ ret
+ /*
+ * Shared filter core (ad-hoc ABI, also entered from
+ * ff_h263_v_loop_filter_rvv):
+ * in: v8..v11 = p0..p3 (8 x u8, e8/mf2), a2 = q, t0 = return address
+ * out: v8..v11 = filtered p0..p3, vtype left at e8/mf2
+ * writes: t1, t2, v0, v12, v14, v16, v18, v20, v24, v26, v28, v30, vxrm
+ */
+1:
+ /* Fixed-point rounding mode 0 for the vnclipu.wi instructions below. */
+ csrwi vxrm, 0
+ /* t1 = PC-relative high part of the strength table address, plus q;
+ * the low part is folded into the LBU offset further down. */
+2: auipc t1, %pcrel_hi(ff_h263_loop_filter_strength)
+ vwsubu.vv v14, v10, v9 # p2 - p1
+ add t1, t1, a2
+ vwsubu.vv v12, v8, v11 # p0 - p3
+ /* The widened differences are 16-bit. The mask-undisturbed policy is
+ * needed by the masked vneg.v below. */
+ vsetvli zero, zero, e16, m1, ta, mu
+ vsll.vi v14, v14, 2
+ lbu t1, %pcrel_lo(2b)(t1) # strength
+ vadd.vv v16, v12, v14
+ # Divide by 8 toward 0. v16 is a signed 10-bit value at this point.
+ vsrl.vi v18, v16, 16 - 3 # v18 = (v16 < 0) ? 7 : 0
+ slli t2, t1, 1 # 2 * strength
+ vadd.vv v16, v16, v18
+ # v16 (d) is signed 7-bit, but later arithmetics require 9 bits.
+ vsra.vi v16, v16, 3 # d
+ vmv.v.x v20, t2
+ vmslt.vi v0, v16, 0
+ vneg.v v18, v16
+ vneg.v v20, v20, v0.t # sign(d) * 2 * strength
+ vmax.vv v18, v16, v18 # |d|
+ vsub.vv v20, v20, v16 # d1 if strength <= |d| <= 2 * strength
+ vmsge.vx v0, v18, t2
+ vsrl.vi v14, v12, 16 - 2 # v14 = (v12 < 0) ? 3 : 0
+ vmerge.vxm v20, v20, zero, v0 # d1 if strength <= |d|
+ vadd.vv v12, v12, v14
+ vmsge.vx v0, v18, t1
+ vsra.vi v12, v12, 2 # (p0 - p3) / 4
+ vmerge.vvm v16, v16, v20, v0 # d1
+ vzext.vf2 v24, v8 # p0 as u16 (because vwrsubu.wv does not exist)
+ vneg.v v14, v16
+ vzext.vf2 v26, v9 # p1 as u16
+ vmax.vv v14, v16, v14 # |d1|
+ vzext.vf2 v28, v10 # p2 as u16
+ vsra.vi v14, v14, 1 # ad1
+ vadd.vv v26, v26, v16 # p1 + d1
+ vneg.v v18, v14 # -ad1
+ vmin.vv v12, v12, v14
+ vsub.vv v28, v28, v16 # p2 - d1
+ vmax.vv v12, v12, v18 # d2
+ vmax.vx v26, v26, zero
+ vsub.vv v24, v24, v12 # p0 - d2
+ vmax.vx v28, v28, zero
+ /* Back to 8-bit elements to narrow the results into v8..v11.
+ * p1 + d1 and p2 - d1 were clamped at 0 by vmax.vx above and are
+ * saturated on the high side by vnclipu.wi. p0 - d2 and p3 + d2 are
+ * narrowed with vncvt.x.x.w, which does not saturate. */
+ vsetvli zero, zero, e8, mf2, ta, ma
+ vwaddu.wv v30, v12, v11 # p3 + d2
+ vncvt.x.x.w v8, v24
+ vnclipu.wi v9, v26, 0
+ vnclipu.wi v10, v28, 0
+ vncvt.x.x.w v11, v30
+ /* Return to the caller of the core through the alternate link register. */
+ jr t0
+endfunc
+ .option pop
+
+/*
+ * void ff_h263_v_loop_filter_rvv(uint8_t *src, int stride, int q)
+ *
+ * a0 = src, a1 = stride, a2 = q (consumed by the shared core).
+ * Loads four rows of 8 bytes:
+ * a3 = src - 2 * stride -> v8 (p0)
+ * a4 = src - stride -> v9 (p1)
+ * a0 = src -> v10 (p2)
+ * a5 = src + stride -> v11 (p3)
+ * It then runs the shared filter core and stores the four rows back to the
+ * same addresses.
+ */
+func ff_h263_v_loop_filter_rvv, zve32x
+ sub a4, a0, a1
+ vsetivli zero, 8, e8, mf2, ta, ma
+ vle8.v v10, (a0)
+ sub a3, a4, a1
+ vle8.v v9, (a4)
+ add a5, a0, a1
+ vle8.v v8, (a3)
+ vle8.v v11, (a5)
+ /* Backward reference to local label 1 inside ff_h263_h_loop_filter_rvv;
+ * t0 is the link register, so ra is preserved. */
+ jal t0, 1b
+ vse8.v v8, (a3)
+ vse8.v v9, (a4)
+ vse8.v v10, (a0)
+ vse8.v v11, (a5)
+ ret
+endfunc
More information about the ffmpeg-cvslog
mailing list