[FFmpeg-devel] [PATCH v2 1/4] lavc/vp8dsp: R-V V 256 bilin,epel
uk7b at foxmail.com
uk7b at foxmail.com
Sun Jul 14 19:28:21 EEST 2024
From: sunyuechi <sunyuechi at iscas.ac.cn>
Benchmark results (columns: X60, new):
vp8_put_bilin16_h_c : 42.5 42.5
vp8_put_bilin16_h_rvv_i32 : 4.7 3.2
vp8_put_bilin16_hv_c : 71.5 71.7
vp8_put_bilin16_hv_rvv_i32 : 8.5 7.5
vp8_put_bilin16_v_c : 43.0 42.7
vp8_put_bilin16_v_rvv_i32 : 4.2 3.0
vp8_put_epel16_h4_c : 82.0 82.0
vp8_put_epel16_h4_rvv_i32 : 12.2 9.7
vp8_put_epel16_h6v6_c : 196.2 196.2
vp8_put_epel16_h6v6_rvv_i32 : 31.2 26.2
vp8_put_epel16_v4_c : 82.2 82.2
vp8_put_epel16_v4_rvv_i32 : 12.0 10.0
...
---
libavcodec/riscv/vp8dsp_init.c | 136 +++++++++++++++++++--------------
libavcodec/riscv/vp8dsp_rvv.S | 133 +++++++++++++++++++-------------
2 files changed, 162 insertions(+), 107 deletions(-)
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index d9e2beb237..dcb6307d5b 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -35,13 +35,19 @@ void ff_vp8_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t str
VP8_EPEL(16, rvi);
VP8_EPEL(8, rvi);
VP8_EPEL(4, rvi);
-VP8_EPEL(16, rvv);
-VP8_EPEL(8, rvv);
-VP8_EPEL(4, rvv);
+VP8_EPEL(16, rvv128);
+VP8_EPEL(8, rvv128);
+VP8_EPEL(4, rvv128);
+VP8_EPEL(16, rvv256);
+VP8_EPEL(8, rvv256);
+VP8_EPEL(4, rvv256);
-VP8_BILIN(16, rvv);
-VP8_BILIN(8, rvv);
-VP8_BILIN(4, rvv);
+VP8_BILIN(16, rvv128);
+VP8_BILIN(8, rvv128);
+VP8_BILIN(4, rvv128);
+VP8_BILIN(16, rvv256);
+VP8_BILIN(8, rvv256);
+VP8_BILIN(4, rvv256);
av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
{
@@ -58,64 +64,82 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
}
#if HAVE_RVV
- if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) {
- c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv;
- c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv;
- c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
- c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
- c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
- c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
+#define init_bilin(vlen) \
+ c->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv##vlen; \
+ c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv##vlen;
- c->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_rvv;
- c->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_rvv;
- c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
- c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
- c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
- c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
+#define init_epel_h_v(vlen) \
+ c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv##vlen;
- c->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv;
- c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv;
+#define init_epel_hv(vlen) \
+ c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv##vlen; \
+ c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv##vlen;
- if (flags & AV_CPU_FLAG_RVB_ADDR) {
- c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
- c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
- c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
- c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
- c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
- c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
+ int vlenb = ff_get_rv_vlenb();
- c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
- c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
- c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
- c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
- c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
- c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
+ if (flags & AV_CPU_FLAG_RVV_I32 && vlenb >= 32) {
+ init_bilin(256);
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_epel_h_v(256);
+#if __riscv_xlen <= 64
+ init_epel_hv(256);
+#endif
+ }
+ } else if (flags & AV_CPU_FLAG_RVV_I32 && vlenb >= 16) {
+ init_bilin(128);
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_epel_h_v(128);
#if __riscv_xlen <= 64
- c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
- c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
- c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
- c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
- c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
- c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
- c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
- c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
- c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
- c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
- c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
- c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
+ init_epel_hv(128);
#endif
}
}
+#undef init_bilin
+#undef init_epel_h_v
+#undef init_epel_hv
#endif
#endif
}
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index d366748a0a..0cbf1672f7 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -21,7 +21,8 @@
#include "libavutil/riscv/asm.S"
-.macro vsetvlstatic8 len
+.macro vsetvlstatic8 len, vlen
+.if \vlen == 128
.if \len <= 4
vsetivli zero, \len, e8, mf4, ta, ma
.elseif \len <= 8
@@ -31,9 +32,21 @@
.elseif \len <= 31
vsetivli zero, \len, e8, m2, ta, ma
.endif
+.else
+.if \len <= 4
+ vsetivli zero, \len, e8, mf8, ta, ma
+.elseif \len <= 8
+ vsetivli zero, \len, e8, mf4, ta, ma
+.elseif \len <= 16
+ vsetivli zero, \len, e8, mf2, ta, ma
+.elseif \len <= 31
+ vsetivli zero, \len, e8, m1, ta, ma
+.endif
+.endif
.endm
-.macro vsetvlstatic16 len
+.macro vsetvlstatic16 len, vlen
+.if \vlen == 128
.if \len <= 4
vsetivli zero, \len, e16, mf2, ta, ma
.elseif \len <= 8
@@ -41,6 +54,15 @@
.elseif \len <= 16
vsetivli zero, \len, e16, m2, ta, ma
.endif
+.else
+.if \len <= 4
+ vsetivli zero, \len, e16, mf4, ta, ma
+.elseif \len <= 8
+ vsetivli zero, \len, e16, mf2, ta, ma
+.elseif \len <= 16
+ vsetivli zero, \len, e16, m1, ta, ma
+.endif
+.endif
.endm
#if __riscv_xlen >= 64
@@ -263,10 +285,10 @@ endfunc
vnsra.wi \dst, v24, 3
.endm
-.macro put_vp8_bilin_h_v type mn
-func ff_put_vp8_bilin4_\type\()_rvv, zve32x
- vsetvlstatic8 4
-.Lbilin_\type:
+.macro put_vp8_bilin_h_v vlen, type, mn
+func ff_put_vp8_bilin4_\type\()_rvv\vlen, zve32x
+ vsetvlstatic8 4, \vlen
+.Lbilin_\type\vlen:
li t1, 8
li t4, 4
sub t1, t1, \mn
@@ -306,12 +328,10 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
endfunc
.endm
-put_vp8_bilin_h_v h a5
-put_vp8_bilin_h_v v a6
-
-func ff_put_vp8_bilin4_hv_rvv, zve32x
- vsetvlstatic8 4
-.Lbilin_hv:
+.macro put_vp8_bilin_hv vlen
+func ff_put_vp8_bilin4_hv_rvv\vlen, zve32x
+ vsetvlstatic8 4, \vlen
+.Lbilin_hv\vlen:
li t3, 8
sub t1, t3, a5
sub t2, t3, a6
@@ -332,23 +352,32 @@ func ff_put_vp8_bilin4_hv_rvv, zve32x
ret
endfunc
+.endm
+
+.irp vlen,256,128
+put_vp8_bilin_h_v \vlen, h, a5
+put_vp8_bilin_h_v \vlen, v, a6
+put_vp8_bilin_hv \vlen
+.endr
.irp len,16,8
-func ff_put_vp8_bilin\len\()_h_rvv, zve32x
- vsetvlstatic8 \len
- j .Lbilin_h
+.irp vlen,256,128
+func ff_put_vp8_bilin\len\()_h_rvv\vlen, zve32x
+ vsetvlstatic8 \len, \vlen
+ j .Lbilin_h\vlen
endfunc
-func ff_put_vp8_bilin\len\()_v_rvv, zve32x
- vsetvlstatic8 \len
- j .Lbilin_v
+func ff_put_vp8_bilin\len\()_v_rvv\vlen, zve32x
+ vsetvlstatic8 \len, \vlen
+ j .Lbilin_v\vlen
endfunc
-func ff_put_vp8_bilin\len\()_hv_rvv, zve32x
- vsetvlstatic8 \len
- j .Lbilin_hv
+func ff_put_vp8_bilin\len\()_hv_rvv\vlen, zve32x
+ vsetvlstatic8 \len, \vlen
+ j .Lbilin_hv\vlen
endfunc
.endr
+.endr
const subpel_filters
.byte 0, -6, 123, 12, -1, 0
@@ -378,7 +407,7 @@ endconst
.endif
.endm
-.macro epel_load dst len size type from_mem regtype
+.macro epel_load dst, len, vlen, size, type, from_mem, regtype
.ifc \type,v
sub t6, a2, a3
add a7, a2, a3
@@ -426,26 +455,26 @@ endconst
.endif
li t6, 64
vwadd.wx v16, v16, t6
- vsetvlstatic16 \len
+ vsetvlstatic16 \len, \vlen
vwadd.vv v24, v16, v20
vnsra.wi v24, v24, 7
vmax.vx v24, v24, zero
- vsetvlstatic8 \len
+ vsetvlstatic8 \len, \vlen
vnclipu.wi \dst, v24, 0
.endm
-.macro epel_load_inc dst len size type from_mem regtype
- epel_load \dst \len \size \type \from_mem \regtype
+.macro epel_load_inc dst, len, vlen, size, type, from_mem, regtype
+ epel_load \dst, \len, \vlen, \size, \type, \from_mem, \regtype
add a2, a2, a3
.endm
-.macro epel len size type
-func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
+.macro epel len, vlen, size, type
+func ff_put_vp8_epel\len\()_\type\()\size\()_rvv\vlen, zve32x
epel_filter \size \type t
- vsetvlstatic8 \len
+ vsetvlstatic8 \len, \vlen
1:
addi a4, a4, -1
- epel_load_inc v30 \len \size \type 1 t
+ epel_load_inc v30, \len, \vlen, \size, \type, 1, t
vse8.v v30, (a0)
add a0, a0, a1
bnez a4, 1b
@@ -454,8 +483,8 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
endfunc
.endm
-.macro epel_hv len hsize vsize
-func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
+.macro epel_hv len, vlen, hsize, vsize
+func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv\vlen, zve32x
#if __riscv_xlen == 64
addi sp, sp, -48
.irp n,0,1,2,3,4,5
@@ -470,22 +499,22 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
sub a2, a2, a3
epel_filter \hsize h t
epel_filter \vsize v s
- vsetvlstatic8 \len
+ vsetvlstatic8 \len, \vlen
.if \hsize == 6 || \vsize == 6
sub a2, a2, a3
- epel_load_inc v0 \len \hsize h 1 t
+ epel_load_inc v0, \len, \vlen, \hsize, h, 1, t
.endif
- epel_load_inc v2 \len \hsize h 1 t
- epel_load_inc v4 \len \hsize h 1 t
- epel_load_inc v6 \len \hsize h 1 t
- epel_load_inc v8 \len \hsize h 1 t
+ epel_load_inc v2, \len, \vlen, \hsize, h, 1, t
+ epel_load_inc v4, \len, \vlen, \hsize, h, 1, t
+ epel_load_inc v6, \len, \vlen, \hsize, h, 1, t
+ epel_load_inc v8, \len, \vlen, \hsize, h, 1, t
.if \hsize == 6 || \vsize == 6
- epel_load_inc v10 \len \hsize h 1 t
+ epel_load_inc v10, \len, \vlen, \hsize, h, 1, t
.endif
addi a4, a4, -1
1:
addi a4, a4, -1
- epel_load v30 \len \vsize v 0 s
+ epel_load v30, \len, \vlen, \vsize, v, 0, s
vse8.v v30, (a0)
.if \hsize == 6 || \vsize == 6
vmv.v.v v0, v2
@@ -495,13 +524,13 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
vmv.v.v v6, v8
.if \hsize == 6 || \vsize == 6
vmv.v.v v8, v10
- epel_load_inc v10 \len \hsize h 1 t
+ epel_load_inc v10, \len, \vlen, \hsize, h, 1, t
.else
- epel_load_inc v8 \len 4 h 1 t
+ epel_load_inc v8, \len, \vlen, 4, h, 1, t
.endif
add a0, a0, a1
bnez a4, 1b
- epel_load v30 \len \vsize v 0 s
+ epel_load v30, \len, \vlen, \vsize, v, 0, s
vse8.v v30, (a0)
#if __riscv_xlen == 64
@@ -521,14 +550,16 @@ endfunc
.endm
.irp len,16,8,4
-epel \len 6 h
-epel \len 4 h
-epel \len 6 v
-epel \len 4 v
+.irp vlen,256,128
+epel \len, \vlen, 6, h
+epel \len, \vlen, 4, h
+epel \len, \vlen, 6, v
+epel \len, \vlen, 4, v
#if __riscv_xlen <= 64
-epel_hv \len 6 6
-epel_hv \len 4 4
-epel_hv \len 6 4
-epel_hv \len 4 6
+epel_hv \len, \vlen, 6, 6
+epel_hv \len, \vlen, 4, 4
+epel_hv \len, \vlen, 6, 4
+epel_hv \len, \vlen, 4, 6
#endif
.endr
+.endr
--
2.45.2
More information about the ffmpeg-devel mailing list