[FFmpeg-devel] [PATCH] lavc/aarch64: add pred functions for 10-bit

Martin Storsjö martin at martin.st
Wed Aug 4 16:08:19 EEST 2021


On Fri, 16 Jul 2021, Mikhail Nitenko wrote:

> Benchmarks:                        A53     A72
> pred8x8_dc_10_c:                   64.2    49.5
> pred8x8_dc_10_neon:                62.7    54.5
> pred8x8_dc_128_10_c:               26.0    15.5
> pred8x8_dc_128_10_neon:            28.2    16.0
> pred8x8_horizontal_10_c:           60.0    27.7
> pred8x8_horizontal_10_neon:        34.2    27.7
> pred8x8_left_dc_10_c:              42.5    27.5
> pred8x8_left_dc_10_neon:           50.7    41.2
> pred8x8_mad_cow_dc_0l0_10_c:       55.7    37.2
> pred8x8_mad_cow_dc_0l0_10_neon:    46.0    36.5
> pred8x8_mad_cow_dc_0lt_10_c:       89.2    67.0
> pred8x8_mad_cow_dc_0lt_10_neon:    50.2    46.7
> pred8x8_mad_cow_dc_l0t_10_c:       75.5    51.0
> pred8x8_mad_cow_dc_l0t_10_neon:    49.7    44.7
> pred8x8_mad_cow_dc_l00_10_c:       58.0    38.0
> pred8x8_mad_cow_dc_l00_10_neon:    41.0    37.5
> pred8x8_plane_10_c:               347.5   288.7
> pred8x8_plane_10_neon:            150.2   108.5
> pred8x8_top_dc_10_c:               44.5    30.5
> pred8x8_top_dc_10_neon:            39.7    31.5
> pred8x8_vertical_10_c:             27.5    16.0
> pred8x8_vertical_10_neon:          27.7    15.0
> pred16x16_plane_10_c:            1245.5  1069.7
> pred16x16_plane_10_neon:          349.0   208.7
>
> Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
> ---
> libavcodec/aarch64/h264pred_init.c |  40 +++-
> libavcodec/aarch64/h264pred_neon.S | 369 ++++++++++++++++++++++++++++-
> 2 files changed, 402 insertions(+), 7 deletions(-)

> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index e40bdc8d53..735d20b49c 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -467,3 +475,356 @@ function ff_pred16x16_vert_neon_10, export=1
>         b.ne            1b
>         ret
> endfunc
> +
> +function ff_pred16x16_plane_neon_10, export=1
> +        sub             x3,  x0,  x1
> +        movrel          x4,  p16weight_10
> +        add             x2,  x3,  #16
> +        sub             x3,  x3,  #2
> +
> +        ld1             {v0.8h},  [x3]
> +        ld1             {v2.8h},  [x2]
> +        ldcol.16        v1,  x3,  x1, 8
> +        add             x3,  x3,  x1
> +        ldcol.16        v3,  x3,  x1, 8
> +
> +        rev64           v16.8h,  v0.8h
> +        trn1            v0.2d,   v16.2d,  v16.2d
> +        trn2            v0.2d,   v16.2d,  v0.2d
> +
> +        rev64           v16.8h,  v1.8h
> +        trn1            v1.2d,   v16.2d,  v16.2d
> +        trn2            v1.2d,   v16.2d,  v1.2d
> +

Umm, these trn1+trn2 are really confusing to try to figure out here. Do 
you want to swap the two halves of the register, to compensate for not 
having a rev128? You can do that with "ext v0.16b, v0.16b, v0.16b, #8" 
instead of these two instructions. (And it's better for pipelining to do 
two rev64 followed by two ext, instead of interleaving them tightly.)
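
I.e. something like this (untested):

        rev64           v0.8h,  v0.8h
        rev64           v1.8h,  v1.8h
        ext             v0.16b, v0.16b, v0.16b, #8
        ext             v1.16b, v1.16b, v1.16b, #8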

> +        uaddl           v7.4s,  v2.4h,  v3.4h

I don't think you need to go to 32 bit here? If you add two 10 bit pixels 
together, the sum (11 bit) still fits in 16 bit elements just fine. (I 
haven't checked how large the intermediates become further in this 
calculation here, whether you need to go to 32 bit somewhere close to the 
end of the calculation or if you can do it all in 16 bit.)

The same applies to the 8x8 version below too.
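
For instance, the edge sums and differences can stay in 16 bit elements,
something like this (untested sketch; the rest of the code would need to be
adjusted to match, and the weighted sums further down might still need
widening, e.g. via smlal/smlal2):

        add             v7.8h,  v2.8h,  v3.8h   // at most 11 bits, fits in 16 bit lanes
        sub             v4.8h,  v2.8h,  v0.8h   // signed 11 bit differences
        sub             v2.8h,  v3.8h,  v1.8h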

> +        uaddl2          v16.4s, v2.8h,  v3.8h
> +        usubl           v4.4s,  v2.4h,  v0.4h
> +        usubl2          v5.4s,  v2.8h,  v0.8h
> +        usubl           v2.4s,  v3.4h,  v1.4h
> +        usubl2          v3.4s,  v3.8h,  v1.8h
> +
> +        ld1             {v0.4s, v1.4s},  [x4]
> +
> +        mul             v4.4s,  v4.4s,  v0.4s
> +        mul             v5.4s,  v5.4s,  v1.4s
> +        mul             v2.4s,  v2.4s,  v0.4s
> +        mul             v3.4s,  v3.4s,  v1.4s
> +
> +        addp            v4.4s,  v4.4s,  v5.4s
> +        addp            v2.4s,  v2.4s,  v3.4s
> +
> +        addp            v4.4s,  v4.4s,  v4.4s
> +        addp            v2.4s,  v2.4s,  v2.4s
> +
> +        addp            v4.2s,  v4.2s,  v4.2s
> +        addp            v2.2s,  v2.2s,  v2.2s
> +        mov             v2.s[0],  v4.s[0]       // H and V

I haven't really studied this in detail, but why do you need to do 
elementwise fiddling here, when it isn't needed in the 8 bit version of 
the function?
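
FWIW, the lane moves could probably be avoided by folding the two
accumulators into one register within the addp chain, e.g. (untested):

        addp            v4.4s,  v4.4s,  v5.4s
        addp            v2.4s,  v2.4s,  v3.4s
        addp            v2.4s,  v4.4s,  v2.4s
        addp            v2.4s,  v2.4s,  v2.4s   // v2.s[0] = H, v2.s[1] = V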

> +
> +        sshll           v3.2d,  v2.2s,  #2
> +        saddw           v2.2d,  v3.2d,  v2.2s
> +        rshrn           v4.2s,  v2.2d,  #6
> +        dup             v5.4s,  v4.s[1]
> +
> +        add             v2.2s,  v4.2s,  v5.2s
> +        shl             v3.4s,  v2.4s,  #3
> +
> +        mov             w2,  v7.s[0]
> +        mov             v7.s[0],  v16.s[3]
> +        mov             v16.s[3],  w2

Same here, there's no corresponding elementwise fiddling in the 8 bit 
version, so I don't think it should be needed here either?

> +
> +        sub             v3.4s,  v3.4s,  v2.4s   // 7 * (b + c)
> +        add             v7.4s,  v7.4s,  v0.4s
> +
> +        shl             v2.4s,  v7.4s,  #4
> +        sub             v2.4s,  v2.4s,  v3.4s
> +        shl             v3.4s,  v4.4s,  #4
> +
> +        movrel          x5,  p16weight_10_new
> +        ld1             {v0.4s, v1.4s},  [x5]

The 8 bit version uses an "ext; mov v0.h[0], wzr" sequence instead of loading a 
whole new set of constants here. Would that work here too, or have you 
lost the original constant?
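
Assuming p16weight_10 holds the same 1..8 sequence as the 8 bit p16weight,
just stored as 32 bit elements split across v0/v1 (the constant isn't quoted
here, so this is a guess), and assuming those registers are still intact at
this point, the same trick could look roughly like this (untested):

        ext             v1.16b, v0.16b, v1.16b, #12     // assumed {4, 5, 6, 7}
        ext             v0.16b, v0.16b, v0.16b, #12     // assumed {4, 1, 2, 3}
        mov             v0.s[0], wzr                    // -> {0, 1, 2, 3}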

> +
> +        sub             v6.4s,  v5.4s,  v3.4s
> +        mul             v0.4s,  v0.4s,  v4.s[0]
> +        mul             v1.4s,  v1.4s,  v4.s[0]
> +        dup             v16.4s,  v2.s[0]
> +        dup             v17.4s,  v2.s[0]
> +        dup             v18.4s,  v4.s[0]
> +        dup             v19.4s,  v4.s[0]
> +        dup             v20.4s,  v6.s[0]
> +        dup             v21.4s,  v6.s[0]
> +        shl             v18.4s,  v18.4s,  #3
> +        shl             v19.4s,  v19.4s,  #3
> +        add             v16.4s,  v16.4s,  v0.4s
> +        add             v17.4s,  v17.4s,  v1.4s
> +        add             v20.4s,  v20.4s,  v18.4s
> +        add             v21.4s,  v21.4s,  v19.4s
> +        mov             w3,  #16
> +        mov             w2,  #1023              // for clipping
> +        dup             v3.8h,  w2

Instead of mov+dup, you can load this constant with "mvni #0xFC, lsl #8", 
which is equivalent to loading 0x3ff.
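
I.e. (untested):

        mvni            v3.8h,  #0xFC,  lsl #8  // each lane = ~0xFC00 = 0x3ff (1023)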


> +1:
> +        sqshrun         v0.4h,  v16.4s,  #5
> +        sqshrun2        v0.8h,  v17.4s,  #5
> +
> +        add             v16.4s,  v16.4s,  v18.4s
> +        add             v17.4s,  v17.4s,  v19.4s
> +
> +        sqshrun         v1.4h,  v16.4s,  #5
> +        sqshrun2        v1.8h,  v17.4s,  #5
> +
> +        add             v16.4s,  v16.4s,  v20.4s
> +        add             v17.4s,  v17.4s,  v21.4s
> +
> +        subs            w3,  w3,  #1
> +
> +        smin            v0.8h,  v0.8h,  v3.8h
> +        smin            v1.8h,  v1.8h,  v3.8h
> +        st1             {v0.8h, v1.8h}, [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +
> +function ff_pred8x8_hor_neon_10, export=1
> +        sub             x2,  x0,  #2
> +        mov             w3,  #8
> +
> +1:      ld1r            {v0.8h},  [x2], x1
> +        subs            w3,  w3,  #1
> +        st1             {v0.8h},  [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred8x8_vert_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        lsl             x1,  x1,  #1
> +
> +        ld1             {v0.8h},  [x2], x1
> +        mov             w3,  #4
> +1:      subs            w3,  w3,  #1
> +        st1             {v0.8h},  [x0], x1
> +        st1             {v0.8h},  [x2], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_pred8x8_plane_neon_10, export=1
> +        sub             x3,  x0,  x1
> +        movrel          x4,  p8weight_10
> +        movrel          x5,  p16weight_10
> +        add             x2,  x3,  #8
> +        sub             x3,  x3,  #2
> +
> +        ld1             {v0.d}[0],  [x3]
> +        ld1             {v2.d}[0],  [x2],  x1
> +        ldcol.16        v0,  x3,  x1,  hi=1
> +        add             x3,  x3,  x1
> +        ldcol.16        v3,  x3,  x1,  4
> +
> +        uaddl           v7.4s,  v2.4h,  v3.4h
> +        rev64           v0.8h,  v0.8h
> +        trn1            v2.2d,  v2.2d,  v3.2d
> +
> +        usubl2          v3.4s,  v2.8h,  v0.8h
> +        usubl           v2.4s,  v2.4h,  v0.4h
> +
> +        ld1             {v6.4s},  [x4]
> +        mul             v2.4s,  v2.4s,  v6.4s
> +        mul             v3.4s,  v3.4s,  v6.4s
> +        ld1             {v0.4s}, [x5]
> +
> +        saddlp          v2.2d,  v2.4s
> +        saddlp          v3.2d,  v3.4s
> +        addp            v2.2d,  v2.2d,  v2.2d
> +        addp            v3.2d,  v3.2d,  v3.2d
> +        mov             v2.d[1], v3.d[0]
> +        shl             v3.2d,  v2.2d,  #4
> +        add             v2.2d,  v3.2d,  v2.2d
> +        rshrn           v5.2s,  v2.2d,  #5
> +        addp            v2.4s,  v5.4s,  v5.4s
> +        shl             v3.4s,  v2.4s,  #1
> +        add             v3.4s,  v3.4s,  v2.4s
> +
> +        rev64           v1.4s,  v7.4s
> +        trn1            v7.2d,  v1.2d,  v1.2d
> +        trn2            v7.2d,  v1.2d,  v7.2d
> +
> +
> +        add             v7.4s,  v7.4s,  v0.4s
> +        shl             v2.4s,  v7.4s,  #4
> +        sub             v2.4s,  v2.4s,  v3.4s
> +
> +        movrel          x5,  p16weight_10_new
> +        ld1             {v6.4s, v7.4s},  [x5]
> +
> +        mul             v6.4s,  v6.4s,  v5.s[0]
> +        mul             v7.4s,  v7.4s,  v5.s[0]
> +
> +        dup             v1.4s,  v2.s[0]
> +        dup             v2.4s,  v2.s[0]
> +        dup             v3.4s,  v5.s[1]
> +
> +        add             v1.4s,  v1.4s,  v6.4s
> +        add             v2.4s,  v2.4s,  v7.4s
> +
> +        mov             w3,  #8
> +        mov             w2,  #1023              // for clipping
> +        dup             v4.8h,  w2
> +1:
> +        sqshrun         v0.4h,  v1.4s,  #5
> +        sqshrun2        v0.8h,  v2.4s,  #5
> +
> +        subs            w3,  w3,  #1
> +
> +        add             v1.4s,  v1.4s,  v3.4s
> +        add             v2.4s,  v2.4s,  v3.4s
> +
> +        smin            v0.8h,  v0.8h,  v4.8h
> +        st1             {v0.8h},  [x0],  x1
> +        b.ne            1b
> +        ret
> +endfunc

Partly the same comments as for the 16x16 version above also apply to this 
function.


> +
> +function ff_pred8x8_128_dc_neon_10, export=1
> +        movi            v0.8h,  #2, lsl #8      // 512, 1 << (bit_depth - 1)
> +        movi            v1.8h,  #2, lsl #8
> +        b               .L_pred8x8_dc_10_end
> +endfunc
> +
> +function ff_pred8x8_top_dc_neon_10, export=1
> +        sub             x2,  x0,  x1
> +        ld1             {v0.8h},  [x2]
> +
> +        uaddlp          v0.4s,  v0.8h

No need to go to 32 bit here; the same applies to most of the other 
functions below too.
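
For the load above, e.g., the pairwise sums still fit comfortably in 16 bit
lanes, something like this (untested; the rest of the reduction would need to
be adjusted to match):

        addp            v0.8h,  v0.8h,  v0.8h   // pairs of 10 bit pixels, 11 bits
        addp            v0.8h,  v0.8h,  v0.8h   // sums of 4 pixels, 12 bits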

// Martin


