[FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
Rémi Denis-Courmont
remi at remlab.net
Tue May 7 19:08:56 EEST 2024
Le tiistaina 7. toukokuuta 2024, 10.36.07 EEST uk7b at foxmail.com a écrit :
> From: sunyuechi <sunyuechi at iscas.ac.cn>
>
> C908:
> vp9_hor_8x8_8bpp_c: 74.7
> vp9_hor_8x8_8bpp_rvv_i32: 35.7
> vp9_hor_16x16_8bpp_c: 175.5
> vp9_hor_16x16_8bpp_rvv_i32: 80.2
> vp9_hor_32x32_8bpp_c: 510.2
> vp9_hor_32x32_8bpp_rvv_i32: 264.0
> ---
> libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 6 ++++
> libavcodec/riscv/vp9dsp_init.c | 3 ++
> 3 files changed, 65 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
> index db9774c263..dd9bc036e7 100644
> --- a/libavcodec/riscv/vp9_intra_rvv.S
> +++ b/libavcodec/riscv/vp9_intra_rvv.S
> @@ -113,3 +113,59 @@ func_dc dc_left 8 left 3 0 zve64x
> func_dc dc_top 32 top 5 1 zve32x
> func_dc dc_top 16 top 4 1 zve32x
> func_dc dc_top 8 top 3 0 zve64x
> +
> +func ff_h_32x32_rvv, zve32x
> + li t0, 32
> + addi a2, a2, 31
> + vsetvli zero, t0, e8, m2, ta, ma
> +
> + .rept 2
> + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> + lbu t1, (a2)
> + addi a2, a2, -1
> + vmv.v.x v\n, t1
> + .endr
> + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> + vse8.v v\n, (a0)
> + add a0, a0, a1
> + .endr
> + .endr
Do you gain much by unrolling all the way to 16x? Given that you already have
the counter value in t0, it should not make much difference to unroll only 2x
or maybe 4x and then loop.

It might also be faster to use lhu or lwu and shift, so as to reduce the number
of scalar loads, at least if the vector is suitably aligned.
> +
> + ret
> +endfunc
> +
> +func ff_h_16x16_rvv, zve32x
> + addi a2, a2, 15
> + vsetivli zero, 16, e8, m1, ta, ma
> +
> + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
> + lbu t1, (a2)
> + addi a2, a2, -1
> + vmv.v.x v\n, t1
> + .endr
> + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
> + vse8.v v\n, (a0)
> + add a0, a0, a1
> + .endr
> + vse8.v v23, (a0)
> +
> + ret
> +endfunc
> +
> +func ff_h_8x8_rvv, zve32x
> + addi a2, a2, 7
> + vsetivli zero, 8, e8, mf2, ta, ma
> +
> + .irp n 8, 9, 10, 11, 12, 13, 14, 15
> + lbu t1, (a2)
> + addi a2, a2, -1
> + vmv.v.x v\n, t1
> + .endr
> + .irp n 8, 9, 10, 11, 12, 13, 14
> + vse8.v v\n, (a0)
> + add a0, a0, a1
> + .endr
> + vse8.v v15, (a0)
> +
> + ret
> +endfunc
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index b8ff282f8a..0ad961c7e0 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
> void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
> +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
> +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
>
> #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
> void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index c10f8bbe41..7816b13fe0 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -59,6 +59,9 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
> dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
> + dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv;
> + dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv;
> + dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
> }
> #endif
> #endif
--
Rémi Denis-Courmont
http://www.remlab.net/
More information about the ffmpeg-devel mailing list