[FFmpeg-devel] [PATCH v2] lavc/aarch64: add pred functions for 10-bit
Martin Storsjö
martin at martin.st
Tue Aug 17 14:01:55 EEST 2021
On Mon, 16 Aug 2021, Mikhail Nitenko wrote:
> Benchmarks:                          A53      A72
> pred8x8_dc_10_c:                    64.2     55.7
> pred8x8_dc_10_neon:                 61.7     53.7
> pred8x8_dc_128_10_c:                26.0     20.7
> pred8x8_dc_128_10_neon:             30.7     24.5
> pred8x8_horizontal_10_c:            60.0     35.2
> pred8x8_horizontal_10_neon:         38.0     33.0
> pred8x8_left_dc_10_c:               42.5     35.5
> pred8x8_left_dc_10_neon:            50.7     41.5
> pred8x8_mad_cow_dc_0l0_10_c:        55.7     44.7
> pred8x8_mad_cow_dc_0l0_10_neon:     47.5     37.2
> pred8x8_mad_cow_dc_0lt_10_c:        89.2     75.5
> pred8x8_mad_cow_dc_0lt_10_neon:     52.2     47.0
> pred8x8_mad_cow_dc_l0t_10_c:        74.7     59.2
> pred8x8_mad_cow_dc_l0t_10_neon:     50.5     44.7
> pred8x8_mad_cow_dc_l00_10_c:        58.0     45.7
> pred8x8_mad_cow_dc_l00_10_neon:     42.5     37.5
> pred8x8_plane_10_c:                347.7    295.5
> pred8x8_plane_10_neon:             136.2    108.0
> pred8x8_top_dc_10_c:                44.5     38.5
> pred8x8_top_dc_10_neon:             39.7     34.5
> pred8x8_vertical_10_c:              27.5     21.7
> pred8x8_vertical_10_neon:           21.0     22.2
> pred16x16_plane_10_c:             1242.0   1075.7
> pred16x16_plane_10_neon:           324.0    199.5
>
> Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
> ---
>
> moved to 32-bit; however, in the plane functions 16 bits are not
> enough and the intermediate sums overflow, so where they would
> overflow the code switches to 32-bit wide sections
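
For reference, the overflow is easy to demonstrate; here is a minimal
standalone C sketch (not part of the patch) of the worst case for the
plane prediction's weighted "H" sum at 10-bit depth:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* H.264 plane prediction computes
     * H = sum_{i=1..8} i * (top[7+i] - top[7-i]), where each
     * difference has magnitude at most 1023 at 10-bit depth. */
    int32_t h_max = 0;
    for (int i = 1; i <= 8; i++)
        h_max += i * 1023;
    printf("max |H| = %d, INT16_MAX = %d\n", (int)h_max, (int)INT16_MAX);
    /* prints 36828 > 32767: a 16-bit accumulator can wrap, while at
     * 8-bit depth the same bound is 36 * 255 = 9180, which fits. */
    return 0;
}

The individual products still fit in 16 bits (8 * 1023 = 8184), but the
reduction, and the 5*H multiple derived from it, need the wider lanes.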
>
> libavcodec/aarch64/h264pred_init.c | 40 +++-
> libavcodec/aarch64/h264pred_neon.S | 302 ++++++++++++++++++++++++++++-
> 2 files changed, 335 insertions(+), 7 deletions(-)
>
> diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
> index 325a86bfcd..0ae8f70d23 100644
> --- a/libavcodec/aarch64/h264pred_init.c
> +++ b/libavcodec/aarch64/h264pred_init.c
> @@ -45,10 +45,23 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
> void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
> void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
>
> -void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> -void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> -void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
> void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_plane_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +
> +void ff_pred8x8_vert_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_hor_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_plane_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_l0t_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_0lt_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_l00_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred8x8_0l0_dc_neon_10(uint8_t *src, ptrdiff_t stride);
>
> static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
> const int bit_depth,
> @@ -84,10 +97,31 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
> h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
> }
> if (bit_depth == 10) {
> + if (chroma_format_idc <= 1) {
> + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon_10;
> + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon_10;
> + if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> + h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon_10;
> + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon_10;
> + if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> + codec_id != AV_CODEC_ID_VP8) {
> + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon_10;
> + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon_10;
> + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon_10;
> + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon_10;
> + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon_10;
> + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon_10;
> + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon_10;
> + }
> + }
> +
> h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10;
> h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10;
> h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10;
> h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
> + if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
> + codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon_10;
> }
> }
>
> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index e40bdc8d53..712741941f 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -361,15 +361,13 @@ function ff_pred8x8_0l0_dc_neon, export=1
> endfunc
>
> .macro ldcol.16 rd, rs, rt, n=4, hi=0
> -.if \n >= 4 || \hi == 0
> +.if \n >= 4 && \hi == 0
> ld1 {\rd\().h}[0], [\rs], \rt
> ld1 {\rd\().h}[1], [\rs], \rt
> -.endif
> -.if \n >= 4 || \hi == 1
> ld1 {\rd\().h}[2], [\rs], \rt
> ld1 {\rd\().h}[3], [\rs], \rt
> .endif
> -.if \n == 8
> +.if \n == 8 || \hi == 1
> ld1 {\rd\().h}[4], [\rs], \rt
> ld1 {\rd\().h}[5], [\rs], \rt
> ld1 {\rd\().h}[6], [\rs], \rt
> @@ -467,3 +465,299 @@ function ff_pred16x16_vert_neon_10, export=1
> b.ne 1b
> ret
> endfunc
> +
> +function ff_pred16x16_plane_neon_10, export=1
> + sub x3, x0, x1
> + movrel x4, p16weight
> + add x2, x3, #16
> + sub x3, x3, #2
> + ld1 {v0.8h}, [x3]
> + ld1 {v2.8h}, [x2], x1
> + ldcol.16 v1, x3, x1, 8
> + add x3, x3, x1
> + ldcol.16 v3, x3, x1, 8
> +
> + rev64 v16.8h, v0.8h
> + rev64 v17.8h, v1.8h
> + ext v0.16b, v16.16b, v16.16b, #8
> + ext v1.16b, v17.16b, v17.16b, #8
> +
> + add v7.8h, v2.8h, v3.8h
> + sub v2.8h, v2.8h, v0.8h
> + sub v3.8h, v3.8h, v1.8h
> + ld1 {v0.8h}, [x4]
> + mul v2.8h, v2.8h, v0.8h
> + mul v3.8h, v3.8h, v0.8h
> + addp v2.8h, v2.8h, v3.8h
> + addp v2.8h, v2.8h, v2.8h
> + addp v2.4h, v2.4h, v2.4h
> + sshll v3.4s, v2.4h, #2
> + saddw v2.4s, v3.4s, v2.4h
> + rshrn v4.4h, v2.4s, #6
> + trn2 v5.4h, v4.4h, v4.4h
> + add v2.4h, v4.4h, v5.4h
> + shl v3.4h, v2.4h, #3
> + ext v7.16b, v7.16b, v7.16b, #14
> + sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
> + add v7.4h, v7.4h, v0.4h
> + shl v2.4h, v7.4h, #4
> + ssubl v2.4s, v2.4h, v3.4h
> + shl v3.4h, v4.4h, #4
> + ext v0.16b, v0.16b, v0.16b, #14
> + ssubl v6.4s, v5.4h, v3.4h
> +
> + mov v0.h[0], wzr
> + mul v0.8h, v0.8h, v4.h[0]
> + dup v16.4s, v2.s[0]
> + dup v17.4s, v2.s[0]
> + dup v2.8h, v4.h[0]
> + dup v3.4s, v6.s[0]
> + shl v2.8h, v2.8h, #3
> + saddw v16.4s, v16.4s, v0.4h
> + saddw2 v17.4s, v17.4s, v0.8h
> + saddw v3.4s, v3.4s, v2.4h
Nit: Please try to fix the wobbly vertical alignment here.
> +
> + mov w3, #16
> + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
> +1:
> + sqshrun v0.4h, v16.4s, #5
> + sqshrun2 v0.8h, v17.4s, #5
> + saddw v16.4s, v16.4s, v2.4h
> + saddw v17.4s, v17.4s, v2.4h
> + sqshrun v1.4h, v16.4s, #5
> + sqshrun2 v1.8h, v17.4s, #5
> + add v16.4s, v16.4s, v3.4s
> + add v17.4s, v17.4s, v3.4s
> +
> + subs w3, w3, #1
> +
> + smin v0.8h, v0.8h, v4.8h
> + smin v1.8h, v1.8h, v4.8h
> + st1 {v0.8h, v1.8h}, [x0], x1
I think it might be better to do the 'subs' between 'smin' and 'st1'.
I haven't verified whether it's possible to do this with narrower
registers, but this seems reasonable as it is.
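I.e. something like this (untested, just reordering the existing
instructions):

        smin            v0.8h,  v0.8h,  v4.8h
        smin            v1.8h,  v1.8h,  v4.8h
        subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            1b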
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_pred8x8_hor_neon_10, export=1
> + sub x2, x0, #2
> + mov w3, #8
> +
> +1: ld1r {v0.8h}, [x2], x1
> + subs w3, w3, #1
> + st1 {v0.8h}, [x0], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_pred8x8_vert_neon_10, export=1
> + sub x2, x0, x1
> + lsl x1, x1, #1
> +
> + ld1 {v0.8h}, [x2], x1
> + mov w3, #4
> +1: subs w3, w3, #1
> + st1 {v0.8h}, [x0], x1
> + st1 {v0.8h}, [x2], x1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_pred8x8_plane_neon_10, export=1
> + sub x3, x0, x1
> + movrel x4, p8weight
> + movrel x5, p16weight
> + add x2, x3, #8
> + sub x3, x3, #2
> + ld1 {v0.d}[0], [x3]
> + ld1 {v2.d}[0], [x2], x1
> + ldcol.16 v0, x3, x1, hi=1
> + add x3, x3, x1
> + ldcol.16 v3, x3, x1, 4
> + add v7.8h, v2.8h, v3.8h
> + rev64 v0.8h, v0.8h
> + trn1 v2.2d, v2.2d, v3.2d
> + sub v2.8h, v2.8h, v0.8h
> + ld1 {v6.8h}, [x4]
> + mul v2.8h, v2.8h, v6.8h
> + ld1 {v0.8h}, [x5]
> + saddlp v2.4s, v2.8h
> + addp v2.4s, v2.4s, v2.4s
> + shl v3.4s, v2.4s, #4
> + add v2.4s, v3.4s, v2.4s
> + rshrn v5.4h, v2.4s, #5
> + addp v2.4h, v5.4h, v5.4h
> + shl v3.4h, v2.4h, #1
> + add v3.4h, v3.4h, v2.4h
> + rev64 v7.4h, v7.4h
> + add v7.4h, v7.4h, v0.4h
> + shl v2.4h, v7.4h, #4
> + ssubl v2.4s, v2.4h, v3.4h
> + ext v0.16b, v0.16b, v0.16b, #14
> + mov v0.h[0], wzr
> + mul v0.8h, v0.8h, v5.h[0]
> + dup v1.4s, v2.s[0]
> + dup v2.4s, v2.s[0]
> + dup v3.8h, v5.h[1]
> + saddw v1.4s, v1.4s, v0.4h
> + saddw2 v2.4s, v2.4s, v0.8h
> + mov w3, #8
> + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
> +1:
> + sqshrun v0.4h, v1.4s, #5
> + sqshrun2 v0.8h, v2.4s, #5
> +
> + subs w3, w3, #1
> +
> + saddw v1.4s, v1.4s, v3.4h
> + saddw v2.4s, v2.4s, v3.4h
> +
> + smin v0.8h, v0.8h, v4.8h
> + st1 {v0.8h}, [x0], x1
I think it might be good to do the 'smin' a bit earlier here, maybe
between the two 'saddw' or after the 'subs'.
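I.e. one of the two orderings, untested, with the 'smin' right after
the 'subs':

1:
        sqshrun         v0.4h,  v1.4s,  #5
        sqshrun2        v0.8h,  v2.4s,  #5
        subs            w3,  w3,  #1
        smin            v0.8h,  v0.8h,  v4.8h
        saddw           v1.4s,  v1.4s,  v3.4h
        saddw           v2.4s,  v2.4s,  v3.4h
        st1             {v0.8h}, [x0], x1
        b.ne            1b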
Looks good other than that.
// Martin