[FFmpeg-devel] [PATCH v2 2/4] avcodec/aarch64/hevcdsp: port add_residual functions
Martin Storsjö
martin at martin.st
Thu Feb 11 11:02:12 EET 2021
On Thu, 4 Feb 2021, Josh Dekker wrote:
> From: Reimar Döffinger <Reimar.Doeffinger at gmx.de>
>
> Speedup is fairly small, around 1.5%, but these are fairly simple.
>
> Signed-off-by: Josh Dekker <josh at itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 190 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 24 +++
> 2 files changed, 214 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index c70d6a906d..329038a958 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -36,6 +36,196 @@ const trans, align=4
> .short 31, 22, 13, 4
> endconst
>
> +.macro clip10 in1, in2, c1, c2
> + smax \in1, \in1, \c1
> + smax \in2, \in2, \c1
> + smin \in1, \in1, \c2
> + smin \in2, \in2, \c2
> +.endm
> +
> +function ff_hevc_add_residual_4x4_8_neon, export=1
> + ld1 {v0.8h-v1.8h}, [x1]
> + ld1 {v2.s}[0], [x0], x2
> + ld1 {v2.s}[1], [x0], x2
> + ld1 {v2.s}[2], [x0], x2
> + ld1 {v2.s}[3], [x0], x2
> + sub x0, x0, x2, lsl #2
> + uxtl v6.8h, v2.8B
> + uxtl2 v7.8h, v2.16B
Personal preference: I prefer the non-shouty forms like v2.16b instead of
v2.16B.
> + sqadd v0.8h, v0.8h, v6.8h
> + sqadd v1.8h, v1.8h, v7.8h
Nit: Inconsistent alignment between columns 1-2 and 2-3. (And if one would
want to make space for full sized operands like v16.16b, they'd all need
another space.)
> + sqxtun v0.8B, v0.8h
> + sqxtun2 v0.16B, v1.8h
> + st1 {v0.s}[0], [x0], x2
> + st1 {v0.s}[1], [x0], x2
> + st1 {v0.s}[2], [x0], x2
> + st1 {v0.s}[3], [x0], x2
> + ret
> +endfunc
> +
> +function ff_hevc_add_residual_4x4_10_neon, export=1
> + mov x12, x0
> + ld1 {v0.8h-v1.8h}, [x1]
> + ld1 {v2.d}[0], [x12], x2
> + ld1 {v2.d}[1], [x12], x2
> + ld1 {v3.d}[0], [x12], x2
> + sqadd v0.8h, v0.8h, v2.8h
> + ld1 {V3.d}[1], [x12], x2
> + movi v4.8h, #0
> + sqadd v1.8h, v1.8h, v3.8h
> + mvni v5.8h, #0xFC, LSL #8 // movi #0x3FF
> + clip10 v0.8h, v1.8h, v4.8h, v5.8h
> + st1 {v0.d}[0], [x0], x2
> + st1 {v0.d}[1], [x0], x2
> + st1 {v1.d}[0], [x0], x2
> + st1 {v1.d}[1], [x0], x2
> + ret
> +endfunc
> +
> +function ff_hevc_add_residual_8x8_8_neon, export=1
> + add x12, x0, x2
> + add x2, x2, x2
> + mov x3, #8
> +1: subs x3, x3, #2
Nit: Odd vertical alignment here?
> + ld1 {v2.d}[0], [x0]
> + ld1 {v2.d}[1], [x12]
> + uxtl v3.8h, v2.8B
> + ld1 {v0.8h-v1.8h}, [x1], #32
> + uxtl2 v2.8h, v2.16B
> + sqadd v0.8h, v0.8h, v3.8h
> + sqadd v1.8h, v1.8h, v2.8h
> + sqxtun v0.8B, v0.8h
> + sqxtun2 v0.16B, v1.8h
> + st1 {v0.d}[0], [x0], x2
> + st1 {v0.d}[1], [x12], x2
> + bne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_add_residual_8x8_10_neon, export=1
> + add x12, x0, x2
> + add x2, x2, x2
> + mov x3, #8
> + movi v4.8h, #0
> + mvni v5.8h, #0xFC, LSL #8 // movi #0x3FF
> +1: subs x3, x3, #2
> + ld1 {v0.8h-v1.8h}, [x1], #32
> + ld1 {v2.8h}, [x0]
> + sqadd v0.8h, v0.8h, v2.8h
> + ld1 {v3.8h}, [x12]
> + sqadd v1.8h, v1.8h, v3.8h
> + clip10 v0.8h, v1.8h, v4.8h, v5.8h
> + st1 {v0.8h}, [x0], x2
> + st1 {v1.8h}, [x12], x2
> + bne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_add_residual_16x16_8_neon, export=1
> + mov x3, #16
> + add x12, x0, x2
> + add x2, x2, x2
> +1: subs x3, x3, #2
> + ld1 {v16.16B}, [x0]
> + ld1 {v0.8h-v3.8h}, [x1], #64
> + ld1 {v19.16B}, [x12]
> + uxtl v17.8h, v16.8B
> + uxtl2 v18.8h, v16.16B
> + uxtl v20.8h, v19.8B
> + uxtl2 v21.8h, v19.16B
> + sqadd v0.8h, v0.8h, v17.8h
> + sqadd v1.8h, v1.8h, v18.8h
> + sqadd v2.8h, v2.8h, v20.8h
> + sqadd v3.8h, v3.8h, v21.8h
> + sqxtun v0.8B, v0.8h
> + sqxtun2 v0.16B, v1.8h
> + sqxtun v1.8B, v2.8h
> + sqxtun2 v1.16B, v3.8h
> + st1 {v0.16B}, [x0], x2
> + st1 {v1.16B}, [x12], x2
> + bne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_add_residual_16x16_10_neon, export=1
> + mov x3, #16
> + movi v20.8h, #0
> + mvni v21.8h, #0xFC, LSL #8 // movi #0x3FF
> + add x12, x0, x2
> + add x2, x2, x2
> +1: subs x3, x3, #2
> + ld1 {v16.8h-v17.8h}, [x0]
> + ld1 {v0.8h-v3.8h}, [x1], #64
> + sqadd v0.8h, v0.8h, v16.8h
> + ld1 {v18.8h-v19.8h}, [x12]
> + sqadd v1.8h, v1.8h, v17.8h
> + sqadd v2.8h, v2.8h, v18.8h
> + sqadd v3.8h, v3.8h, v19.8h
> + clip10 v0.8h, v1.8h, v20.8h, v21.8h
> + clip10 v2.8h, v3.8h, v20.8h, v21.8h
> + st1 {v0.8h-v1.8h}, [x0], x2
> + st1 {v2.8h-v3.8h}, [x12], x2
> + bne 1b
> + ret
> +endfunc
> +
> +function ff_hevc_add_residual_32x32_8_neon, export=1
> + add x12, x0, x2
> + add x2, x2, x2
> + mov x3, #32
> +1: subs x3, x3, #2
> + ld1 {v20.16B, v21.16B}, [x0]
> + uxtl v16.8h, v20.8B
> + uxtl2 v17.8h, v20.16B
> + ld1 {v22.16B, v23.16B}, [x12]
> + uxtl v18.8h, v21.8B
> + uxtl2 v19.8h, v21.16B
> + uxtl v20.8h, v22.8B
> + ld1 {v0.8h-v3.8h}, [x1], #64
> + ld1 {v4.8h-v7.8h}, [x1], #64
> + uxtl2 v21.8h, v22.16B
> + uxtl v22.8h, v23.8B
> + uxtl2 v23.8h, v23.16B
> + sqadd v0.8h, v0.8h, v16.8h
> + sqadd v1.8h, v1.8h, v17.8h
Here, the vertical alignment is visibly inconsistent across instructions
where they could line up better.
Other than that, I've got nothing to complain about functionally, and it
gives a very good speedup (3-14x depending on block size and core type).
// Martin
More information about the ffmpeg-devel mailing list