[FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv

flow gg hlefthleft at gmail.com
Tue May 7 05:31:53 EEST 2024


> IMO, passing a complete register name, if you really need to vary it,
> would be simpler and more flexible than an ABI register type prefix.

If full register names were passed here, some invocations would require
four parameters and some six, often with repetition.
I feel it would be easy to get confused about how the parameters differ
from one invocation to the next.
If a prefix is used instead, only one parameter is needed, which I think
is less error-prone.

> This code actually requires ==, not >=.
> You can do that but you only need half the stack space and offsets.

Ok, fixed it

Rémi Denis-Courmont <remi at remlab.net> 于2024年5月7日周二 03:25写道:

> Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b at foxmail.com a écrit :
> > From: sunyuechi <sunyuechi at iscas.ac.cn>
> >
> > C908:
> > vp8_put_epel4_h4v4_c: 20.0
> > vp8_put_epel4_h4v4_rvv_i32: 11.0
> > vp8_put_epel4_h4v6_c: 25.2
> > vp8_put_epel4_h4v6_rvv_i32: 13.5
> > vp8_put_epel4_h6v4_c: 22.2
> > vp8_put_epel4_h6v4_rvv_i32: 14.5
> > vp8_put_epel4_h6v6_c: 29.0
> > vp8_put_epel4_h6v6_rvv_i32: 15.7
> > vp8_put_epel8_h4v4_c: 73.0
> > vp8_put_epel8_h4v4_rvv_i32: 22.2
> > vp8_put_epel8_h4v6_c: 90.5
> > vp8_put_epel8_h4v6_rvv_i32: 26.7
> > vp8_put_epel8_h6v4_c: 85.0
> > vp8_put_epel8_h6v4_rvv_i32: 27.2
> > vp8_put_epel8_h6v6_c: 104.7
> > vp8_put_epel8_h6v6_rvv_i32: 29.5
> > vp8_put_epel16_h4v4_c: 145.5
> > vp8_put_epel16_h4v4_rvv_i32: 26.5
> > vp8_put_epel16_h4v6_c: 190.7
> > vp8_put_epel16_h4v6_rvv_i32: 47.5
> > vp8_put_epel16_h6v4_c: 173.7
> > vp8_put_epel16_h6v4_rvv_i32: 33.2
> > vp8_put_epel16_h6v6_c: 222.2
> > vp8_put_epel16_h6v6_rvv_i32: 35.5
> > ---
> >  libavcodec/riscv/vp8dsp_init.c |  13 ++++
> >  libavcodec/riscv/vp8dsp_rvv.S  | 117 +++++++++++++++++++++++++++------
> >  2 files changed, 109 insertions(+), 21 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> > index dc3e087f01..463c8fa0a2 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >          c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> >          c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> >          c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> > +
> > +        c->put_vp8_epel_pixels_tab[0][2][2] =
> ff_put_vp8_epel16_h6v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[0][2][1] =
> ff_put_vp8_epel16_h4v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[0][1][1] =
> ff_put_vp8_epel16_h4v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[0][1][2] =
> ff_put_vp8_epel16_h6v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> >      }
> >  #endif
> >  #endif
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index bf268e4d8d..baa8152830 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -161,26 +161,26 @@ const subpel_filters
> >          .byte 0,  -1,  12, 123,  -6, 0
> >  endconst
> >
> > -.macro epel_filter size type
> > -        lla             t2, subpel_filters
> > +.macro epel_filter size type regtype
> > +        lla             \regtype\()2, subpel_filters
> >  .ifc \type,v
> > -        addi            t0, a6, -1
> > +        addi            \regtype\()0, a6, -1
>
> IMO, passing a complete register name, if you really need to vary it,
> would be simpler and more flexible than an ABI register type prefix.
>
> >  .elseif \type == h
> > -        addi            t0, a5, -1
> > +        addi            \regtype\()0, a5, -1
> >  .endif
> > -        li              t1, 6
> > -        mul             t0, t0, t1
> > -        add             t0, t0, t2
> > +        li              \regtype\()1, 6
> > +        mul             \regtype\()0, \regtype\()0, \regtype\()1
> > +        add             \regtype\()0, \regtype\()0, \regtype\()2
> >          .irp n 1,2,3,4
> > -        lb              t\n, \n(t0)
> > +        lb              \regtype\n, \n(\regtype\()0)
> >          .endr
> >  .ifc \size,6
> > -        lb              t5, 5(t0)
> > -        lb              t0, (t0)
> > +        lb              \regtype\()5, 5(\regtype\()0)
> > +        lb              \regtype\()0, (\regtype\()0)
> >  .endif
> >  .endm
> >
> > -.macro epel_load dst len size type
> > +.macro epel_load dst len size type from_mem regtype
> >  .ifc \type,v
> >          mv              a5, a3
> >  .else
> > @@ -189,24 +189,35 @@ endconst
> >          sub             t6, a2, a5
> >          add             a7, a2, a5
> >
> > +.if \from_mem
> >          vle8.v          v24, (a2)
> >          vle8.v          v22, (t6)
> >          vle8.v          v26, (a7)
> >          add             a7, a7, a5
> >          vle8.v          v28, (a7)
> > -        vwmulu.vx       v16, v24, t2
> > -        vwmulu.vx       v20, v26, t3
> > +        vwmulu.vx       v16, v24, \regtype\()2
> > +        vwmulu.vx       v20, v26, \regtype\()3
> >  .ifc \size,6
> >          sub             t6, t6, a5
> >          add             a7, a7, a5
> >          vle8.v          v24, (t6)
> >          vle8.v          v26, (a7)
> > -        vwmaccu.vx      v16, t0, v24
> > -        vwmaccu.vx      v16, t5, v26
> > +        vwmaccu.vx      v16, \regtype\()0, v24
> > +        vwmaccu.vx      v16, \regtype\()5, v26
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()1, v22
> > +        vwmaccsu.vx     v16, \regtype\()4, v28
> > +.else
> > +        vwmulu.vx       v16, v4, \regtype\()2
> > +        vwmulu.vx       v20, v6, \regtype\()3
> > +        .ifc \size,6
> > +        vwmaccu.vx      v16, \regtype\()0, v0
> > +        vwmaccu.vx      v16, \regtype\()5, v10
> > +        .endif
> > +        vwmaccsu.vx     v16, \regtype\()1, v2
> > +        vwmaccsu.vx     v16, \regtype\()4, v8
> >  .endif
> >          li              t6, 64
> > -        vwmaccsu.vx     v16, t1, v22
> > -        vwmaccsu.vx     v16, t4, v28
> >          vwadd.wx        v16, v16, t6
> >          vsetvlstatic16  \len
> >          vwadd.vv        v24, v16, v20
> > @@ -216,18 +227,18 @@ endconst
> >          vnclipu.wi      \dst, v24, 0
> >  .endm
> >
> > -.macro epel_load_inc dst len size type
> > -        epel_load       \dst \len \size \type
> > +.macro epel_load_inc dst len size type from_mem regtype
> > +        epel_load       \dst \len \size \type \from_mem \regtype
> >          add             a2, a2, a3
> >  .endm
> >
> >  .macro epel len size type
> >  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> > -        epel_filter     \size \type
> > +        epel_filter     \size \type t
> >          vsetvlstatic8   \len
> >  1:
> >          addi            a4, a4, -1
> > -        epel_load_inc   v30 \len \size \type
> > +        epel_load_inc   v30 \len \size \type 1 t
> >          vse8.v          v30, (a0)
> >          add             a0, a0, a1
> >          bnez            a4, 1b
> > @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> > zve32x endfunc
> >  .endm
> >
> > +.macro epel_hv len hsize vsize
> > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
> > +        addi            sp, sp, -48
> > +        .irp n 0,1,2,3,4,5
> > +#if __riscv_xlen >= 64
>
> This code actually requires ==, not >=.
>
> > +        sd              s\n, \n\()<<3(sp)
> > +#else
> > +        sw              s\n, \n\()<<3(sp)
>
> You can do that but you only need half the stack space and offsets.
>
> (And that's why I avoid S and FS registers like the plague, but
> sometimes you just can't.)
>
> > +#endif
> > +        .endr
> > +        sub             a2, a2, a3
> > +        epel_filter     \hsize h t
> > +        epel_filter     \vsize v s
> > +        vsetvlstatic8   \len
> > +.if \hsize == 6 || \vsize == 6
> > +        sub             a2, a2, a3
> > +        epel_load_inc   v0 \len \hsize h 1 t
> > +.endif
> > +        epel_load_inc   v2 \len \hsize h 1 t
> > +        epel_load_inc   v4 \len \hsize h 1 t
> > +        epel_load_inc   v6 \len \hsize h 1 t
> > +        epel_load_inc   v8 \len \hsize h 1 t
> > +.if \hsize == 6 || \vsize == 6
> > +        epel_load_inc   v10 \len \hsize h 1 t
> > +.endif
> > +        addi            a4, a4, -1
> > +1:
> > +        addi            a4, a4, -1
> > +        epel_load       v30 \len \vsize v 0 s
> > +        vse8.v          v30, (a0)
> > +.if \hsize == 6 || \vsize == 6
> > +        vmv.v.v         v0, v2
> > +.endif
> > +        vmv.v.v         v2, v4
> > +        vmv.v.v         v4, v6
> > +        vmv.v.v         v6, v8
> > +.if \hsize == 6 || \vsize == 6
> > +        vmv.v.v         v8, v10
> > +        epel_load_inc   v10 \len \hsize h 1 t
> > +.else
> > +        epel_load_inc   v8 \len 4 h 1 t
> > +.endif
> > +        add             a0, a0, a1
> > +        bnez            a4, 1b
> > +        epel_load       v30 \len \vsize v 0 s
> > +        vse8.v          v30, (a0)
> > +
> > +        .irp n 0,1,2,3,4,5
> > +#if __riscv_xlen >= 64
> > +        ld              s\n, \n\()<<3(sp)
> > +#else
> > +        lw              s\n, \n\()<<3(sp)
> > +#endif
> > +        .endr
> > +        addi            sp, sp, 48
> > +
> > +        ret
> > +endfunc
> > +.endm
> > +
> >  .irp len 16,8,4
> >  put_vp8_bilin_h_v \len h a5
> >  put_vp8_bilin_h_v \len v a6
> > @@ -244,4 +315,8 @@ epel \len 6 h
> >  epel \len 4 h
> >  epel \len 6 v
> >  epel \len 4 v
> > +epel_hv \len 6 6
> > +epel_hv \len 4 4
> > +epel_hv \len 6 4
> > +epel_hv \len 4 6
> >  .endr
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>


More information about the ffmpeg-devel mailing list