[FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v
Rémi Denis-Courmont
remi at remlab.net
Sat May 25 13:17:46 EEST 2024
On Tuesday, 21 May 2024 at 20:13:17 EEST, uk7b at foxmail.com wrote:
> From: sunyuechi <sunyuechi at iscas.ac.cn>
>
> C908 X60
> vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2
> vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2
> vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5
> vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7
> vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0
> vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0
> vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0
> vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2
> vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0
> vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0
> vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0
> vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5
> vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2
> vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2
> vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2
> vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 72 ++++++----
> libavcodec/riscv/vp9dsp_init.c | 38 +++++-
> 3 files changed, 328 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 739380d9a9..adba4afb90 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
> .endif
> .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> + vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> + vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> + vsetvli zero, zero, e16, m2, ta, ma
> +.else
> + vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
> .macro copy_avg len
> func ff_vp9_avg\len\()_rvv, zve32x
> csrwi vxrm, 0
> @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> endfunc
> .endm
>
> +const subpel_filters_regular
> + .byte 0, 0, 0, 128, 0, 0, 0, 0
> + .byte 0, 1, -5, 126, 8, -3, 1, 0
> + .byte -1, 3, -10, 122, 18, -6, 2, 0
> + .byte -1, 4, -13, 118, 27, -9, 3, -1
> + .byte -1, 4, -16, 112, 37, -11, 4, -1
> + .byte -1, 5, -18, 105, 48, -14, 4, -1
> + .byte -1, 5, -19, 97, 58, -16, 5, -1
> + .byte -1, 6, -19, 88, 68, -18, 5, -1
> + .byte -1, 6, -19, 78, 78, -19, 6, -1
> + .byte -1, 5, -18, 68, 88, -19, 6, -1
> + .byte -1, 5, -16, 58, 97, -19, 5, -1
> + .byte -1, 4, -14, 48, 105, -18, 5, -1
> + .byte -1, 4, -11, 37, 112, -16, 4, -1
> + .byte -1, 3, -9, 27, 118, -13, 4, -1
> + .byte 0, 2, -6, 18, 122, -10, 3, -1
> + .byte 0, 1, -3, 8, 126, -5, 1, 0
> +subpel_filters_sharp:
> + .byte 0, 0, 0, 128, 0, 0, 0, 0
> + .byte -1, 3, -7, 127, 8, -3, 1, 0
> + .byte -2, 5, -13, 125, 17, -6, 3, -1
> + .byte -3, 7, -17, 121, 27, -10, 5, -2
> + .byte -4, 9, -20, 115, 37, -13, 6, -2
> + .byte -4, 10, -23, 108, 48, -16, 8, -3
> + .byte -4, 10, -24, 100, 59, -19, 9, -3
> + .byte -4, 11, -24, 90, 70, -21, 10, -4
> + .byte -4, 11, -23, 80, 80, -23, 11, -4
> + .byte -4, 10, -21, 70, 90, -24, 11, -4
> + .byte -3, 9, -19, 59, 100, -24, 10, -4
> + .byte -3, 8, -16, 48, 108, -23, 10, -4
> + .byte -2, 6, -13, 37, 115, -20, 9, -4
> + .byte -2, 5, -10, 27, 121, -17, 7, -3
> + .byte -1, 3, -6, 17, 125, -13, 5, -2
> + .byte 0, 1, -3, 8, 127, -7, 3, -1
> +subpel_filters_smooth:
> + .byte 0, 0, 0, 128, 0, 0, 0, 0
> + .byte -3, -1, 32, 64, 38, 1, -3, 0
> + .byte -2, -2, 29, 63, 41, 2, -3, 0
> + .byte -2, -2, 26, 63, 43, 4, -4, 0
> + .byte -2, -3, 24, 62, 46, 5, -4, 0
> + .byte -2, -3, 21, 60, 49, 7, -4, 0
> + .byte -1, -4, 18, 59, 51, 9, -4, 0
> + .byte -1, -4, 16, 57, 53, 12, -4, -1
> + .byte -1, -4, 14, 55, 55, 14, -4, -1
> + .byte -1, -4, 12, 53, 57, 16, -4, -1
> + .byte 0, -4, 9, 51, 59, 18, -4, -1
> + .byte 0, -4, 7, 49, 60, 21, -3, -2
> + .byte 0, -4, 5, 46, 62, 24, -3, -2
> + .byte 0, -4, 4, 43, 63, 26, -2, -2
> + .byte 0, -3, 2, 41, 63, 29, -2, -2
> + .byte 0, -3, 1, 38, 64, 32, -1, -3
> +endconst
Is there a reason that you cannot use the tables from C code?
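For reference, the C code already exports these coefficients as
ff_vp9_subpel_filters[3][16][8] (declared in libavcodec/vp9data.h), but as
int16_t rather than bytes, so the loads and strides would have to double. A
rough, untested sketch of what reusing it inside epel_filter might look like
(the offsets here are assumptions, not part of this patch):

        # hypothetical only: reuse the C-side int16_t table instead of a
        # byte-packed local copy; each phase is then 8 * 2 = 16 bytes
        lla     \regtype\()2, ff_vp9_subpel_filters  # + 256 bytes per filter-type index
        slli    \regtype\()0, a5, 4                  # mx * 16 bytes per phase
        add     \regtype\()0, \regtype\()0, \regtype\()2
        lh      \regtype\()1, 2(\regtype\()0)        # tap 1, and so on for taps 2..7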
> +
> +.macro epel_filter name type regtype
> + lla \regtype\()2, subpel_filters_\name
It should be possible to spare one ADDI by using just AUIPC here and folding
the immediate offset into the LBs below (see also the H.263 loop filter).
> + li \regtype\()1, 8
> +.ifc \type,v
> + mul \regtype\()0, a6, \regtype\()1
> +.else
> + mul \regtype\()0, a5, \regtype\()1
A single slli by 3 instead of the li + mul pair?
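That is, the li/mul pair collapses into one shift (untested; sh3add could even
fold the following add where Zba can be assumed):

.ifc \type,v
        slli    \regtype\()0, a6, 3      # my * 8
.else
        slli    \regtype\()0, a5, 3      # mx * 8
.endif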
> +.endif
> + add \regtype\()0, \regtype\()0, \regtype\()2
> + .irp n,1,2,3,4,5,6
> + lb \regtype\n, \n(\regtype\()0)
> + .endr
> +.ifc \regtype,t
> + lb a7, 7(\regtype\()0)
> +.else
> + lb s7, 7(\regtype\()0)
> +.endif
> + lb \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst len op name type from_mem regtype
> + li a5, 64
> +.ifc \from_mem, 1
> + vle8.v v22, (a2)
> +.ifc \type,v
> + sub a2, a2, a3
> + vle8.v v20, (a2)
> + sh1add a2, a3, a2
> + vle8.v v24, (a2)
> + add a2, a2, a3
> + vle8.v v26, (a2)
> + add a2, a2, a3
> + vle8.v v28, (a2)
> + add a2, a2, a3
> + vle8.v v30, (a2)
> +.else
> + addi a2, a2, -1
> + vle8.v v20, (a2)
> + addi a2, a2, 2
> + vle8.v v24, (a2)
> + addi a2, a2, 1
> + vle8.v v26, (a2)
> + addi a2, a2, 1
> + vle8.v v28, (a2)
> + addi a2, a2, 1
> + vle8.v v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> + vwmulu.vx v16, v24, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v20
> + vwmaccu.vx v16, \regtype\()5, v26
> + vwmaccsu.vx v16, \regtype\()6, v28
> +.else
> + vwmulu.vx v16, v28, \regtype\()6
> + vwmaccsu.vx v16, \regtype\()2, v20
> + vwmaccsu.vx v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v30
> +.else
> + vwmaccsu.vx v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> + .rept 6
> + sub a2, a2, a3
> + .endr
> + vle8.v v28, (a2)
> + sub a2, a2, a3
> + vle8.v v26, (a2)
> + sh1add a2, a3, a2
> + add a2, a2, a3
> +.else
> + addi a2, a2, -6
> + vle8.v v28, (a2)
> + addi a2, a2, -1
> + vle8.v v26, (a2)
> + addi a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> + vwmaccsu.vx v16, \regtype\()1, v28
> +.else
> + vwmaccu.vx v16, \regtype\()1, v28
> + vwmulu.vx v28, v24, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v26
> + vwmulu.vx v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> + vwmulu.vx v16, v8, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v4
> + vwmaccu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()1, v2
> +.else
> + vwmulu.vx v16, v2, \regtype\()1
> + vwmaccu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()2, v4
> + vwmulu.vx v28, v8, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v0
> + vwmulu.vx v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v14
> +.else
> + vwmaccsu.vx v16, s7, v14
> +.endif
> +
> +.endif
> + vwadd.wx v16, v16, a5
> + vsetvlstatic16 \len
> +
> +.ifc \name,smooth
> + vwadd.vv v24, v16, v20
> +.else
> + vwadd.vv v24, v16, v28
> + vwadd.wv v24, v24, v20
> +.endif
> + vnsra.wi v24, v24, 7
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \len, zero, 32, m2
> +
> + vnclipu.wi \dst, v24, 0
> +.ifc \op,avg
> + vle8.v v24, (a0)
> + vaaddu.vv \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst len op name type from_mem regtype
> + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
> + add a2, a2, a3
> +.endm
> +
> +.macro epel len op name type vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> + epel_filter \name, \type, t
> +.if \vlen < 256
> + vsetvlstatic8 \len, a5, 32, m2
> +.else
> + vsetvlstatic8 \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> +
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> +.if \len == 64 && \vlen < 256
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> + addi a0, a0, -32
> + addi a2, a2, -32
> +.endif
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> .irp op, put, avg
> bilin_h_v \len, \op, h, a5
> bilin_h_v \len, \op, v, a6
> + .irp name, regular, sharp, smooth
AFAICT, regular and sharp are identical except for the base address of the
filter table, so it should be possible to share the byte code. Similarly, it
should be possible to share most of the horizontal and vertical code (maybe
also for bilinear, not just EPel) with separate load/store wrappers around
shared inner procedures. The H.263 loop filter already does that with almost
no overhead, though H.263 is obviously simpler than VP9.

A French philosopher famously said that Perfect is the enemy of Good.
Generally, as with VVC, nested repetition macros for finely specialised
functions tend to generate way too much byte code, and this ends up being
worse rather than better in the big picture.
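Concretely, the .irp nest below instantiates 5 sizes x 2 ops x 3 filters x
2 directions x 2 vector lengths = 120 specialised functions. A rough, untested
sketch of the sharing idea, with thin per-filter entry points that only
materialise the table pointer before reaching one shared body (labels and
layout hypothetical):

func ff_put_vp9_8tap_sharp_16h_rvv, zve32x
        lla     t2, subpel_filters_sharp
        j       1f
endfunc
func ff_put_vp9_8tap_regular_16h_rvv, zve32x
        lla     t2, subpel_filters_regular
1:
        # shared 16-pixel horizontal 8-tap body (the existing epel code)
        ret
endfunc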
> + .irp type, h, v
> + epel \len, \op, \name, \type, 128
> + epel \len, \op, \name, \type, 256
> + .endr
> + .endr
> .endr
> .endr
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
>  
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                        \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
>                              const uint8_t *src, ptrdiff_t srcstride, \
>                              int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
> VP9_BILINEAR_RISCV_RVV_FUNC(64);
> VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..314a1e5808 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>  # endif
>
> #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +        if (ff_rv_vlen_least(128)) {
>
> #define init_fpel(idx1, sz) \
> dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
> @@ -85,7 +86,42 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>          dsp->mc[4][FILTER_BILINEAR    ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
>
> #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> +
> + init_subpel2(0, 1, 0, h, put, 128);
> + init_subpel2(1, 1, 0, h, avg, 128);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 128);
> + init_subpel2(1, 0, 1, v, avg, 128);
> + }
> +
> + }
> + if (ff_rv_vlen_least(256)) {
> + init_subpel2(0, 1, 0, h, put, 256);
> + init_subpel2(1, 1, 0, h, avg, 256);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 256);
> + init_subpel2(1, 0, 1, v, avg, 256);
> + }
> }
> + }
> +
> #endif
> #endif
> }
--
Rémi Denis-Courmont
http://www.remlab.net/