[FFmpeg-devel] [PATCH] lavc/aarch64: add pred16x16 10-bit functions

Thu Apr 8 23:55:48 EEST 2021

On Thu, 8 Apr 2021, Mikhail Nitenko wrote:

> here are the benchmarks https://0x1.st/kX.txt
> ---
> libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
> libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
> 2 files changed, 168 insertions(+), 30 deletions(-)
>
> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index 213b40b3e7..633b401d59 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -359,3 +359,126 @@ function ff_pred8x8_0l0_dc_neon, export=1
>         dup             v1.8b,  v1.b[0]
>         b               .L_pred8x8_dc_end
> endfunc
> +
> +.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
> +.if \n >= 4 || \hi == 0
> +        ld1             {\rd\().h}[0],  [\rs], \rt
> +        ld1             {\rd\().h}[1],  [\rs], \rt
> +.endif
> +.if \n >= 4 || \hi == 1
> +        ld1             {\rd\().h}[2],  [\rs], \rt
> +        ld1             {\rd\().h}[3],  [\rs], \rt
> +.endif
> +.if \n == 8
> +        ld1             {\rd\().h}[4],  [\rs], \rt
> +        ld1             {\rd\().h}[5],  [\rs], \rt
> +        ld1             {\rd\().h}[6],  [\rs], \rt
> +        ld1             {\rd\().h}[7],  [\rs], \rt
> +.endif
> +.endm

I believe this could be a bit faster by using two alternating address
registers that are each incremented, but as the existing code doesn't do
that, it's not necessary.

> +
> +function ff_pred16x16_128_dc_neon_10, export=1
> +        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
> +
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_top_dc_neon_10, export=1
> +        sub             x2,  x0,  x1
> +
> +        ld1             {v0.8h},  [x2], #16
> +        ld1             {v1.8h},  [x2]

This can be one single instruction, ld1 {v0.8h, v1.8h}, [x2]

> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h

When adding up 8 elements that are 10 bits each, the sum still fits in
16 bits (it only requires 13 bits), so you don't need uaddlv here; addv
would be better. And when adding the two results, the sum still fits in
16 bits (then it'd use 14 bits).

> +
> +        add             v0.2s, v0.2s, v1.2s
> +
> +        rshrn           v0.4h,  v0.4s,  #4
> +        dup             v0.8h, v0.h[0]
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_left_dc_neon_10, export=1
> +        sub             x2,  x0,  #2 // access to the "left" column
> +        ldcol.16        v0,  x2,  x1,  8
> +        ldcol.16        v1,  x2,  x1,  8 // load "left" column
> +
> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h

Same thing here, addv+addv should be enough

> +
> +        add             v0.2s,  v0.2s,  v1.2s
> +
> +        rshrn           v0.4h,  v0.4s,  #4
> +        dup             v0.8h, v0.h[0]
> +        b               .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_dc_neon_10, export=1
> +        sub             x2,  x0,  x1 // access to the "top" row
> +        sub             x3,  x0,  #2 // access to the "left" column
> +
> +        ld1             {v0.8h}, [x2], #16
> +        ld1             {v1.8h}, [x2]

One single ld1 {v0.8h, v1.8h}

> +        ldcol.16        v2,  x3,  x1,  8
> +        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" col and "left" row
> +
> +        uaddlv          s0,  v0.8h
> +        uaddlv          s1,  v1.8h
> +        uaddlv          s2,  v2.8h // sum all pixels in the "top" row and "left" col
> +        uaddlv          s3,  v3.8h // (sum stays in v0-v3 registers)

addv is enough here too (the sum of 32 10-bit values still fits in 16 bits)

> +
> +        add             v0.2s,  v0.2s,  v1.2s
> +        add             v0.2s,  v0.2s,  v2.2s
> +        add             v0.2s,  v0.2s,  v3.2s // sum registers v0-v3
> +
> +        rshrn           v0.4h,  v0.4s,  #5 // right shift vector
> +        dup             v0.8h,  v0.h[0] // fill vector with 0th value (dcsplat)

These comments are kinda pointless here

> +.L_pred16x16_dc_10_end:
> +        sub             x1,  x1,  #16
> +        mov             w3,  #8
> +6:      st1             {v0.8h}, [x0], #16
> +        st1             {v0.8h}, [x0], x1

This can be one single "st1 {v0.8h, v1.8h}, [x0], x1" if you make sure
that v1 contains the same value as v0 (x1 then has to be the plain
stride, without the "sub x1, x1, #16" adjustment).

> +        st1             {v0.8h}, [x0], #16
> +        st1             {v0.8h}, [x0], x1
> +
> +        subs            w3,  w3,  #1
> +        b.ne            6b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_hor_neon_10, export=1
> +        sub             x2,  x0,  #2
> +        sub             x3,  x1,  #16
> +
> +        mov             w4,  #16
> +1:      ld1r            {v0.8h},  [x2],  x1
> +        st1             {v0.8h},  [x0],  #16
> +        st1             {v0.8h},  [x0],  x3

This might be ok here, but also do check if copying the value to v1 and 
doing one single "st1 {v0.8h, v1.8h}" is faster.

> +
> +        subs            w4,  w4,  #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_vert_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        add             x1,  x1,  x1
> +        sub             x1,  x1,  #16
> +
> +        ld1             {v0.8h},  [x2], #16
> +        ld1             {v1.8h},  [x2], x1

One single ld1

> +
> +        mov             w3,  #8
> +1:      st1             {v0.8h},  [x0],  #16
> +        st1             {v1.8h},  [x0],  x1

One single st1

// Martin