[FFmpeg-devel] [PATCH v2 4/4] lavc/vp8dsp: R-V V loop_filter
uk7b at foxmail.com
uk7b at foxmail.com
Sun Jul 14 19:28:24 EEST 2024
From: sunyuechi <sunyuechi at iscas.ac.cn>
                                C908    X60
vp8_loop_filter8uv_h_c       :  12.2   10.0
vp8_loop_filter8uv_h_rvv_i32 :  11.5    9.7
vp8_loop_filter8uv_v_c       :  13.2   11.2
vp8_loop_filter8uv_v_rvv_i32 :   8.0    6.5
vp8_loop_filter16y_h_c       :  11.7   10.5
vp8_loop_filter16y_h_rvv_i32 :   9.2    7.2
vp8_loop_filter16y_v_c       :  11.5   10.5
vp8_loop_filter16y_v_rvv_i32 :   5.5    3.7
---
libavcodec/riscv/vp8dsp_init.c | 6 +++-
libavcodec/riscv/vp8dsp_rvv.S | 59 ++++++++++++++++++++++++++++++++++
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 8cb21b8ceb..c53223a0e8 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -158,7 +158,11 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv##vlen; \
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv##vlen; \
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv##vlen; \
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_rvv##vlen;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_rvv##vlen; \
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_rvv##vlen; \
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_rvv##vlen; \
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_rvv##vlen; \
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_rvv##vlen;
int flags = av_get_cpu_flags();
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 036872a29e..298910ea90 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -390,12 +390,43 @@ endfunc
.if \normal
vmand.mm v0, v1, v3 // vp8_normal_limit & !hv
+ .if \inner
vnclip.wi v22, v22, 0 // clip_int8(a);
filter_fmin \len, \vlen, v22, v12, v26, v10, v24, v20
vadd.vi v12, v12, 1
vsra.vi v12, v12, 1 // (f1 + 1) >> 1;
vadd.vv v8, v8, v12 // p1 + a
vsub.vv v14, v14, v12 // q1 - a
+ .else
+ li t6, 27
+ li a7, 18
+ li a6, 9
+ vwmul.vx v22, v28, t6
+ vwmul.vx v4, v28, a7
+ vwmul.vx v26, v28, a6
+ vsetvlstatic16 \len, \vlen
+ li a7, 63
+ vzext.vf2 v18, v2 // p2
+ vzext.vf2 v28, v7 // q2
+ vadd.vx v22, v22, a7
+ vadd.vx v4, v4, a7
+ vadd.vx v26, v26, a7
+ vsra.vi v22, v22, 7 // a0
+ vsra.vi v4, v4, 7 // a1
+ vsra.vi v26, v26, 7 // a2
+ vadd.vv v18, v18, v26
+ vadd.vv v8, v8, v4
+ vadd.vv v30, v24, v22
+ vsub.vv v28, v28, v26
+ vsub.vv v10, v20, v22
+ vsub.vv v14, v14, v4
+ vmax.vx v18, v18, zero
+ vmax.vx v8, v8, zero
+ vmax.vx v26, v30, zero
+ vmax.vx v10, v10, zero
+ vmax.vx v14, v14, zero
+ vmax.vx v28, v28, zero
+ .endif
vmax.vx v8, v8, zero
vmax.vx v14, v14, zero
@@ -404,14 +435,27 @@ endfunc
vnclipu.wi v4, v26, 0 // -1
vnclipu.wi v5, v10, 0 // 0
vnclipu.wi v6, v14, 0 // 1
+ .if !\inner
+ vnclipu.wi v2, v18, 0
+ vnclipu.wi v7, v28, 0
+ addi t0, \dst, -3
+ .endif
.ifc \type,v
vse8.v v3, (t2), v0.t // -2
vse8.v v4, (t3), v0.t // -1
vse8.v v5, (\dst), v0.t // 0
vse8.v v6, (t4), v0.t // 1
+ .if !\inner
+ vse8.v v2, (t1), v0.t
+ vse8.v v7, (t5), v0.t
+ .endif
.else
+ .if \inner
vssseg4e8.v v3, (t2), \stride, v0.t
+ .else
+ vssseg6e8.v v2, (t0), \stride, v0.t
+ .endif
.endif
.endif
.endm
@@ -439,6 +483,21 @@ func ff_vp8_\type\()_loop_filter8uv_inner_rvv\vlen, zve32x
filter 8, \vlen, \type, 1, 1, a1, a2, a3, a4, a5
ret
endfunc
+
+func ff_vp8_\type\()_loop_filter16_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 16, \vlen
+ filter 16, \vlen, \type, 1, 0, a0, a1, a2, a3, a4
+ ret
+endfunc
+
+func ff_vp8_\type\()_loop_filter8uv_rvv\vlen, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 8, \vlen
+ filter 8, \vlen, \type, 1, 0, a0, a2, a3, a4, a5
+ filter 8, \vlen, \type, 1, 0, a1, a2, a3, a4, a5
+ ret
+endfunc
.endr
.endr
--
2.45.2
More information about the ffmpeg-devel mailing list