[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: h264, add chroma loop filters for 10bit
Lynne
dev at lynne.ee
Fri Jul 16 21:33:42 EEST 2021
16 Jul 2021, 19:56 by mnitenko at gmail.com:
> Benchmarks: A53 A72
> h264_h_loop_filter_chroma422_10bpp_c: 293.0 116.7
> h264_h_loop_filter_chroma422_10bpp_neon: 283.7 126.2
> h264_h_loop_filter_chroma_10bpp_c: 165.2 58.5
> h264_h_loop_filter_chroma_10bpp_neon: 74.7 87.2
> h264_h_loop_filter_chroma_intra422_10bpp_c: 246.2 124.5
> h264_h_loop_filter_chroma_intra422_10bpp_neon: 178.7 70.0
> h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 40.5
> h264_h_loop_filter_chroma_intra_10bpp_neon: 73.7 59.2
> h264_h_loop_filter_chroma_mbaff422_10bpp_c: 145.7 72.7
> h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 151.7 87.2
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.5 48.0
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 73.7 37.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 57.0 27.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 81.7 50.7
> h264_h_loop_filter_luma_intra_8bpp_c: 242.7 134.0
> h264_h_loop_filter_luma_intra_8bpp_neon: 100.7 53.5
> h264_v_loop_filter_chroma_10bpp_c: 257.2 138.5
> h264_v_loop_filter_chroma_10bpp_neon: 98.2 67.5
> h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 76.2
> h264_v_loop_filter_chroma_intra_10bpp_neon: 62.7 36.5
>
> Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
> ---
>
> This code is a bit slow, particularly the horizontal versions, so any
> suggestions are greatly appreciated!
>
> libavcodec/aarch64/h264dsp_init_aarch64.c | 29 +++
> libavcodec/aarch64/h264dsp_neon.S | 299 ++++++++++++++++++++++
> 2 files changed, 328 insertions(+)
>
> diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
> index d5baccf235..9ee9c11e15 100644
> --- a/libavcodec/aarch64/h264dsp_init_aarch64.c
> +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
> @@ -83,6 +83,21 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
> int16_t *block, int stride,
> const uint8_t nnzc[6*8]);
>
> +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
> + int beta, int8_t *tc0);
> +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
> + int alpha, int beta);
> +
> av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
> const int chroma_format_idc)
> {
> @@ -125,5 +140,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
> c->h264_idct8_add = ff_h264_idct8_add_neon;
> c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
> c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
> + } else if (have_neon(cpu_flags) && bit_depth == 10) {
> + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10;
> + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10;
> +
> + if (chroma_format_idc <= 1) {
> + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10;
> + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10;
> + } else {
> + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10;
> + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
> + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
> + }
> }
> }
> diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
> index fbb8ecc463..92e5afa524 100644
> --- a/libavcodec/aarch64/h264dsp_neon.S
> +++ b/libavcodec/aarch64/h264dsp_neon.S
> @@ -827,3 +827,302 @@ endfunc
> weight_func 16
> weight_func 8
> weight_func 4
> +
> +.macro h264_loop_filter_start_10
> + cmp w2, #0
> + ldr w6, [x4]
> + ccmp w3, #0, #0, ne
> + lsl w2, w2, #2 // shift needed for 10bit
> + mov v24.S[0], w6
> + lsl w3, w3, #2
> + and w8, w6, w6, lsl #16
> + b.eq 1f
> + cmp w6, #0
> + b.eq 1f
> + ands w8, w8, w8, lsl #8
> + b.ge 2f
> +1:
> + ret
> +2:
> +.endm
> +
> +.macro h264_loop_filter_start_intra_10
> + orr w4, w2, w3
> + cbnz w4, 1f
> + ret
> +1:
> + sxtw x1, w1
> + lsl w2, w2, #2 // shift needed for 10bit
> + lsl w3, w3, #2 // shift needed for 10bit
> + dup v30.8h, w2 // alpha
> + dup v31.8h, w3 // beta
> +.endm
> +
> +.macro h264_loop_filter_chroma_10
> + dup v22.8h, w2 // alpha
> + dup v23.8h, w3 // beta
> + uxtl v24.8h, v24.8b // tc0
> +
> + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0)
> + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0)
> +
> + cmhi v26.8h, v22.8h, v26.8h // < alpha
> + cmhi v28.8h, v23.8h, v28.8h // < beta
> + cmhi v30.8h, v23.8h, v30.8h // < beta
> +
> + uxtl v4.4s, v0.4h
> + uxtl2 v5.4s, v0.8h
> +
> + and v26.16b, v26.16b, v28.16b
> +
> + usubw v4.4s, v4.4s, v16.4h
> + usubw2 v5.4s, v5.4s, v16.8h
> +
> + and v26.16b, v26.16b, v30.16b
> +
> + shl v4.4s, v4.4s, #2
> + shl v5.4s, v5.4s, #2
> +
> + mov x8, v26.d[0]
> + mov x9, v26.d[1]
> + orr x8, x8, x9
> +
> + sli v24.8H, v24.8H, #8
> + uxtl v24.8H, v24.8B
> + uaddw v4.4s, v4.4s, v18.4h
> + uaddw2 v5.4s, v5.4s, v18.8h // add p1
> +
> + cbz x8, 9f
> +
> + usubw v4.4s, v4.4s, v2.4h
> + usubw2 v5.4s, v5.4s, v2.8h // sub q1
> + rshrn v4.4h, v4.4s, #3
> + rshrn2 v4.8h, v5.4s, #3
> +
> + mov w8, #1
> + dup v31.8h, w8 // this is actually important for higher depths, but not needed in 8 bit
> + sub v24.8h, v24.8h, v31.8h
> + shl v24.8h, v24.8h, #2
> + add v24.8h, v24.8h, v31.8h
> + mov w8, #0
> + dup v31.8h, w8
> + smax v24.8h, v24.8h, v31.8h // this all feels like a huge hack (needed to exclude neg values)
> +
> + smin v4.8h, v4.8h, v24.8h
> + neg v25.8h, v24.8h
> + smax v4.8h, v4.8h, v25.8h
> +
> + uxtl v22.4s, v0.4h
> + uxtl2 v23.4s, v0.8h
> +
> + and v4.16B, v4.16B, v26.16B
> +
> + uxtl v28.4s, v16.4h
> + uxtl2 v29.4s, v16.8h
> +
> + saddw v28.4s, v28.4s, v4.4h
> + saddw2 v29.4s, v29.4s, v4.8h
> +
> + ssubw v22.4s, v22.4s, v4.4h
> + ssubw2 v23.4s, v23.4s, v4.8h
> +
> + sqxtun v16.4h, v28.4s
> + sqxtun2 v16.8h, v29.4s
> +
> + sqxtun v0.4h, v22.4s
> + sqxtun2 v0.8h, v23.4s
> +
> + mov w2, #1023 // for clipping
> + dup v4.8h, w2
> + smin v0.8h, v0.8h, v4.8h
> + smin v16.8h, v16.8h, v4.8h
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> + sxtw x1, w1
> +
> + sub x0, x0, x1, lsl #1
> + ld1 {v18.8h}, [x0], x1
> + ld1 {v16.8h}, [x0], x1
> + ld1 {v0.8h}, [x0], x1
> + ld1 {v2.8h}, [x0]
> +
> + h264_loop_filter_chroma_10
> +
> + sub x0, x0, x1, lsl #1
> + st1 {v16.8h}, [x0], x1
> + st1 {v0.8h}, [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> + sxtw x1, w1
> +
> + sub x0, x0, #4
> +h_loop_filter_chroma420_10:
> + ld1 {v18.d}[0], [x0], x1
> + ld1 {v16.d}[0], [x0], x1
> + ld1 {v0.d}[0], [x0], x1
> + ld1 {v2.d}[0], [x0], x1
> + ld1 {v18.d}[1], [x0], x1
> + ld1 {v16.d}[1], [x0], x1
> + ld1 {v0.d}[1], [x0], x1
> + ld1 {v2.d}[1], [x0], x1
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + h264_loop_filter_chroma_10
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + sub x0, x0, x1, lsl #3
> + st1 {v18.d}[0], [x0], x1
> + st1 {v16.d}[0], [x0], x1
> + st1 {v0.d}[0], [x0], x1
> + st1 {v2.d}[0], [x0], x1
> + st1 {v18.d}[1], [x0], x1
> + st1 {v16.d}[1], [x0], x1
> + st1 {v0.d}[1], [x0], x1
> + st1 {v2.d}[1], [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma422_neon_10, export=1
> + sxtw x1, w1
> + h264_loop_filter_start_10
> + add x5, x0, x1
> + sub x0, x0, #4
> + add x1, x1, x1
> + mov x7, x30
> + bl h_loop_filter_chroma420_10
> + mov x30, x7
> + sub x0, x5, #4
> + mov v24.s[0], w6
> + b h_loop_filter_chroma420_10
> +endfunc
> +
> +.macro h264_loop_filter_chroma_intra_10
> + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0)
> + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0)
> + cmhi v26.8h, v30.8h, v26.8h // < alpha
> + cmhi v27.8h, v31.8h, v27.8h // < beta
> + cmhi v28.8h, v31.8h, v28.8h // < beta
> + and v26.16b, v26.16b, v27.16b
> + and v26.16b, v26.16b, v28.16b
> + mov x2, v26.d[0]
> + mov x3, v26.d[1]
> + orr x2, x2, x3
> +
> + ushll v4.4s, v18.4h, #1
> + ushll2 v5.4s, v18.8h, #1
> + ushll v6.4s, v19.4h, #1
> + ushll2 v7.4s, v19.8h, #1
> +
> + cbz x2, 9f
> +
> + uaddl v20.4s, v16.4h, v19.4h
> + uaddl2 v21.4s, v16.8h, v19.8h
> + uaddl v22.4s, v17.4h, v18.4h
> + uaddl2 v23.4s, v17.8h, v18.8h
> + add v20.4s, v20.4s, v4.4s
> + add v21.4s, v21.4s, v5.4s
> + add v22.4s, v22.4s, v6.4s
> + add v23.4s, v23.4s, v7.4s
> + uqrshrn v24.4h, v20.4s, #2
> + uqrshrn2 v24.8h, v21.4s, #2
> + uqrshrn v25.4h, v22.4s, #2
> + uqrshrn2 v25.8h, v23.4s, #2
> + bit v16.16b, v24.16b, v26.16b
> + bit v17.16b, v25.16b, v26.16b
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> + sub x0, x0, x1, lsl #1
> + ld1 {v18.8h}, [x0], x1
> + ld1 {v16.8h}, [x0], x1
> + ld1 {v17.8h}, [x0], x1
> + ld1 {v19.8h}, [x0]
> +
> + h264_loop_filter_chroma_intra_10
> +
> + sub x0, x0, x1, lsl #1
> + st1 {v16.8h}, [x0], x1
> + st1 {v17.8h}, [x0], x1
> +
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> +
> + sub x4, x0, #4
> + sub x0, x0, #2
> + ld1 {v18.8h}, [x4], x1
> + ld1 {v16.8h}, [x4], x1
> + ld1 {v17.8h}, [x4], x1
> + ld1 {v19.8h}, [x4], x1
> +
> + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
> +
> + h264_loop_filter_chroma_intra_10
> +
> + st2 {v16.h,v17.h}[0], [x0], x1
> + st2 {v16.h,v17.h}[1], [x0], x1
> + st2 {v16.h,v17.h}[2], [x0], x1
> + st2 {v16.h,v17.h}[3], [x0], x1
> +
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> +
> + sub x4, x0, #4
> + sub x0, x0, #2
> +h_loop_filter_chroma420_intra_10:
> + ld1 {v18.8h}, [x4], x1
> + ld1 {v16.8h}, [x4], x1
> + ld1 {v17.8h}, [x4], x1
> + ld1 {v19.8h}, [x4], x1
> + ld1 {v18.d}[1], [x4], x1
> + ld1 {v16.d}[1], [x4], x1
> + ld1 {v17.d}[1], [x4], x1
> + ld1 {v19.d}[1], [x4], x1
> +
> + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
> +
> + h264_loop_filter_chroma_intra_10
> +
> + st2 {v16.h,v17.h}[0], [x0], x1
> + st2 {v16.h,v17.h}[1], [x0], x1
> + st2 {v16.h,v17.h}[2], [x0], x1
> + st2 {v16.h,v17.h}[3], [x0], x1
> + st2 {v16.h,v17.h}[4], [x0], x1
> + st2 {v16.h,v17.h}[5], [x0], x1
> + st2 {v16.h,v17.h}[6], [x0], x1
> + st2 {v16.h,v17.h}[7], [x0], x1
> +
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> + sub x4, x0, #4
> + add x5, x0, x1, lsl #3
> + sub x0, x0, #2
> + mov x7, x30
> + bl h_loop_filter_chroma420_intra_10
> + sub x0, x5, #2
> + mov x30, x7
> + b h_loop_filter_chroma420_intra_10
> +endfunc
>
I think you have enough spare registers to unroll h_loop_filter_chroma420_intra_10
twice. That should speed up ff_h264_h_loop_filter_chroma422_intra_neon_10
considerably, since on aarch64 an unused register is a wasted register.
More information about the ffmpeg-devel mailing list