[FFmpeg-devel] [PATCH] lavc/vp7dsp: add R-V V vp7_luma_dc_wht

Rémi Denis-Courmont remi at remlab.net
Sun May 26 12:28:20 EEST 2024


This works out a bit more favourably than the VP8 equivalent due to:
- additional multiplications that can be vectorised,
- the hardware-supported fixed-point rounding mode.

vp7_luma_dc_wht_c:       3.2
vp7_luma_dc_wht_rvv_i64: 2.0
---
 libavcodec/riscv/Makefile      |  2 +
 libavcodec/riscv/vp7dsp_init.c | 41 +++++++++++++++
 libavcodec/riscv/vp7dsp_rvv.S  | 96 ++++++++++++++++++++++++++++++++++
 libavcodec/vp8dsp.c            |  4 ++
 libavcodec/vp8dsp.h            |  2 +
 5 files changed, 145 insertions(+)
 create mode 100644 libavcodec/riscv/vp7dsp_init.c
 create mode 100644 libavcodec/riscv/vp7dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index e608436aa4..590655f829 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -65,6 +65,8 @@ RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
 OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
 RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
 RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
+OBJS-$(CONFIG_VP7_DECODER) += riscv/vp7dsp_init.o
+RVV-OBJS-$(CONFIG_VP7_DECODER) += riscv/vp7dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
 RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
new file mode 100644
index 0000000000..6d9aba43d9
--- /dev/null
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vp8dsp.h"
+
+void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]); /* defined in vp7dsp_rvv.S */
+
+av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c) /* installs RISC-V overrides into *c */
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) { /* parses as (flags & ...) && ...; presumably VLEN >= 128 bits */
+#if __riscv_xlen >= 64 /* the assembly is only built when __riscv_xlen >= 64 */
+        c->vp8_luma_dc_wht = ff_vp7_luma_dc_wht_rvv; /* vp8_ slot: VP7 shares VP8DSPContext */
+#endif
+    }
+#endif
+}
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
new file mode 100644
index 0000000000..fb21e9a595
--- /dev/null
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2024 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64 /* RV64 only, presumably because of the sd stores below */
+func ff_vp7_luma_dc_wht_rvv, zve32x /* a0 = block (output), a1 = dc (input, cleared) */
+        csrwi       vxrm, 0 /* rounding mode 0 (round-to-nearest-up) for vnclip */
+        li          t4, 12540
+        vsetivli    zero, 4, e16, mf2, ta, ma /* 4 x 16-bit elements */
+        vlseg4e16.v v0, (a1) /* de-interleave: vN[i] = dc[4 * i + N], called xN below */
+        li          t6, 30274
+        vwmul.vx    v8, v1, t4 /* widening (32-bit) products: v8 = x1 * 12540 */
+        li          t5, 23170
+        vwmul.vx    v9, v3, t6 /* v9 = x3 * 30274 */
+        addi        t1, sp, -12 * 2 /* t1..t3 = scratch buffer + 8/16/24 bytes */
+        vwmul.vx    v10, v1, t6 /* v10 = x1 * 30274 */
+        addi        t2, sp, -8 * 2
+        vwmul.vx    v11, v3, t4 /* v11 = x3 * 12540 */
+        addi        t3, sp, -4 * 2
+        vwadd.vv    v4, v0, v2 /* v4 = x0 + x2 (widened) */
+        addi        sp, sp, -16 * 2 /* reserve 32 bytes of stack scratch */
+        vwsub.vv    v5, v0, v2 /* v5 = x0 - x2 (widened) */
+        vsetvli     zero, zero, e32, m1, ta, ma /* keep vl, switch to 32-bit elements */
+        vadd.vv     v7, v10, v11 /* q = x1 * 30274 + x3 * 12540 */
+        vmul.vx     v4, v4, t5 /* s = (x0 + x2) * 23170 */
+        vsub.vv     v6, v8, v9 /* p = x1 * 12540 - x3 * 30274 */
+        vmul.vx     v5, v5, t5 /* d = (x0 - x2) * 23170 */
+        vadd.vv     v0, v4, v7 /* output 0 = s + q */
+        vsub.vv     v3, v4, v7 /* output 3 = s - q */
+        vadd.vv     v1, v5, v6 /* output 1 = d + p */
+        vsub.vv     v2, v5, v6 /* output 2 = d - p */
+        vsetvli     zero, zero, e16, mf2, ta, ma /* back to 16-bit elements */
+        vnsra.wi    v4, v0, 14 /* first pass: narrow with a plain >> 14 (no rounding) */
+        vnsra.wi    v7, v3, 14
+        vnsra.wi    v5, v1, 14
+        vnsra.wi    v6, v2, 14
+        vsseg4e16.v v4, (sp) /* interleave v4..v7 into scratch: mem[4 * i + N] = v(4+N)[i] */
+        vle16.v     v0, (sp) /* contiguous reloads: vK[j] = mem[4 * K + j], i.e. transposed */
+        vle16.v     v1, (t1)
+        vle16.v     v2, (t2)
+        vle16.v     v3, (t3)
+        vwmul.vx    v8, v1, t4 /* second pass: same arithmetic on the transposed data */
+        li          t0, 16 * 2 /* output stride in bytes (16 x int16_t) */
+        vwmul.vx    v9, v3, t6
+        addi        t1, a0, 1 * 4 * 16 * 2 /* t1 = &block[1][0][0] */
+        vwmul.vx    v10, v1, t6
+        addi        t2, a0, 2 * 4 * 16 * 2 /* t2 = &block[2][0][0] */
+        vwmul.vx    v11, v3, t4
+        addi        t3, a0, 3 * 4 * 16 * 2 /* t3 = &block[3][0][0] */
+        vwadd.vv    v4, v0, v2
+        vwsub.vv    v5, v0, v2
+        vsetvli     zero, zero, e32, m1, ta, ma
+        vmul.vx     v4, v4, t5
+        sd          zero,   (a1) /* clear the 32 bytes of dc[] (already loaded above) */
+        vadd.vv     v7, v10, v11
+        sd          zero,  8(a1)
+        vmul.vx     v5, v5, t5
+        sd          zero, 16(a1)
+        vsub.vv     v6, v8, v9
+        sd          zero, 24(a1)
+        vadd.vv     v0, v4, v7
+        addi        sp, sp, 16 * 2 /* release the scratch (its contents were reloaded above) */
+        vsub.vv     v3, v4, v7
+        vadd.vv     v1, v5, v6
+        vsub.vv     v2, v5, v6
+        vsetvli     zero, zero, e16, mf2, ta, ma
+        vnclip.wi   v4, v0, 18 /* narrow with >> 18, rounded per vxrm and saturated */
+        vnclip.wi   v5, v1, 18
+        vnclip.wi   v6, v2, 18
+        vnclip.wi   v7, v3, 18
+        vsse16.v    v4, (a0), t0 /* strided stores: block[0][i][0], i = 0..3 */
+        vsse16.v    v5, (t1), t0 /* block[1][i][0] */
+        vsse16.v    v6, (t2), t0 /* block[2][i][0] */
+        vsse16.v    v7, (t3), t0 /* block[3][i][0] */
+
+        ret
+endfunc
+#endif
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 8624c3ae15..88bb67f78d 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -712,6 +712,10 @@ av_cold void ff_vp7dsp_init(VP8DSPContext *dsp)
 
     dsp->vp8_v_loop_filter_simple = vp7_v_loop_filter_simple_c;
     dsp->vp8_h_loop_filter_simple = vp7_h_loop_filter_simple_c;
+
+#if ARCH_RISCV
+    ff_vp7dsp_init_riscv(dsp);
+#endif
 }
 #endif /* CONFIG_VP7_DECODER */
 
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
index 3bf12b6b45..e3de2e0494 100644
--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -90,6 +90,8 @@ void ff_vp78dsp_init_ppc(VP8DSPContext *c);
 void ff_vp78dsp_init_riscv(VP8DSPContext *c);
 void ff_vp78dsp_init_x86(VP8DSPContext *c);
 
+void ff_vp7dsp_init_riscv(VP8DSPContext *c);
+
 void ff_vp8dsp_init(VP8DSPContext *c);
 void ff_vp8dsp_init_aarch64(VP8DSPContext *c);
 void ff_vp8dsp_init_arm(VP8DSPContext *c);
-- 
2.45.1



More information about the ffmpeg-devel mailing list