[FFmpeg-devel] [PATCH v2 2/4] avcodec/aarch64/hevcdsp: port add_residual functions

Martin Storsjö martin at martin.st
Thu Feb 11 11:02:12 EET 2021


On Thu, 4 Feb 2021, Josh Dekker wrote:

> From: Reimar Döffinger <Reimar.Doeffinger at gmx.de>
>
> Speedup is fairly small, around 1.5%, but these are fairly simple.
>
> Signed-off-by: Josh Dekker <josh at itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 190 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
> 2 files changed, 214 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index c70d6a906d..329038a958 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -36,6 +36,196 @@ const trans, align=4
>         .short 31, 22, 13, 4
> endconst
> 
> +.macro clip10 in1, in2, c1, c2
> +        smax        \in1, \in1, \c1
> +        smax        \in2, \in2, \c1
> +        smin        \in1, \in1, \c2
> +        smin        \in2, \in2, \c2
> +.endm
> +
> +function ff_hevc_add_residual_4x4_8_neon, export=1
> +        ld1             {v0.8h-v1.8h}, [x1]
> +        ld1             {v2.s}[0], [x0], x2
> +        ld1             {v2.s}[1], [x0], x2
> +        ld1             {v2.s}[2], [x0], x2
> +        ld1             {v2.s}[3], [x0], x2
> +        sub             x0, x0, x2, lsl #2
> +        uxtl            v6.8h,  v2.8B
> +        uxtl2           v7.8h,  v2.16B

Personal preference: I prefer the non-shouty forms like v2.16b instead of 
v2.16B.

> +        sqadd           v0.8h,  v0.8h, v6.8h
> +        sqadd           v1.8h,  v1.8h, v7.8h

Nit: Inconsistent alignment between columns 1-2 and 2-3. (And if one would 
want to make space for full-sized operands like v16.16b, they'd all need 
another space.)

> +        sqxtun          v0.8B,  v0.8h
> +        sqxtun2         v0.16B, v1.8h
> +        st1             {v0.s}[0], [x0], x2
> +        st1             {v0.s}[1], [x0], x2
> +        st1             {v0.s}[2], [x0], x2
> +        st1             {v0.s}[3], [x0], x2
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_4x4_10_neon, export=1
> +        mov             x12, x0
> +        ld1             {v0.8h-v1.8h}, [x1]
> +        ld1             {v2.d}[0], [x12], x2
> +        ld1             {v2.d}[1], [x12], x2
> +        ld1             {v3.d}[0], [x12], x2
> +        sqadd           v0.8h, v0.8h, v2.8h
> +        ld1             {V3.d}[1], [x12], x2
> +        movi            v4.8h, #0
> +        sqadd           v1.8h, v1.8h, v3.8h
> +        mvni            v5.8h, #0xFC, LSL #8 // movi #0x3FF
> +        clip10          v0.8h, v1.8h, v4.8h, v5.8h
> +        st1             {v0.d}[0], [x0], x2
> +        st1             {v0.d}[1], [x0], x2
> +        st1             {v1.d}[0], [x0], x2
> +        st1             {v1.d}[1], [x0], x2
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_8x8_8_neon, export=1
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +        mov             x3,   #8
> +1:      subs            x3,   x3, #2

Nit: Odd vertical alignment here?

> +        ld1             {v2.d}[0],   [x0]
> +        ld1             {v2.d}[1],   [x12]
> +        uxtl            v3.8h,   v2.8B
> +        ld1             {v0.8h-v1.8h}, [x1], #32
> +        uxtl2           v2.8h,   v2.16B
> +        sqadd           v0.8h,   v0.8h,   v3.8h
> +        sqadd           v1.8h,   v1.8h,   v2.8h
> +        sqxtun          v0.8B,   v0.8h
> +        sqxtun2         v0.16B,  v1.8h
> +        st1             {v0.d}[0],   [x0], x2
> +        st1             {v0.d}[1],   [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_8x8_10_neon, export=1
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +        mov             x3,  #8
> +        movi            v4.8h, #0
> +        mvni            v5.8h, #0xFC, LSL #8 // movi #0x3FF
> +1:      subs            x3,  x3, #2
> +        ld1             {v0.8h-v1.8h}, [x1], #32
> +        ld1             {v2.8h},    [x0]
> +        sqadd           v0.8h, v0.8h, v2.8h
> +        ld1             {v3.8h},    [x12]
> +        sqadd           v1.8h, v1.8h, v3.8h
> +        clip10          v0.8h, v1.8h, v4.8h, v5.8h
> +        st1             {v0.8h}, [x0], x2
> +        st1             {v1.8h}, [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_16x16_8_neon, export=1
> +        mov             x3,  #16
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +1:      subs            x3,  x3, #2
> +        ld1             {v16.16B},     [x0]
> +        ld1             {v0.8h-v3.8h}, [x1], #64
> +        ld1             {v19.16B},    [x12]
> +        uxtl            v17.8h, v16.8B
> +        uxtl2           v18.8h, v16.16B
> +        uxtl            v20.8h, v19.8B
> +        uxtl2           v21.8h, v19.16B
> +        sqadd           v0.8h,  v0.8h, v17.8h
> +        sqadd           v1.8h,  v1.8h, v18.8h
> +        sqadd           v2.8h,  v2.8h, v20.8h
> +        sqadd           v3.8h,  v3.8h, v21.8h
> +        sqxtun          v0.8B,  v0.8h
> +        sqxtun2         v0.16B, v1.8h
> +        sqxtun          v1.8B,  v2.8h
> +        sqxtun2         v1.16B, v3.8h
> +        st1             {v0.16B},     [x0], x2
> +        st1             {v1.16B},     [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_16x16_10_neon, export=1
> +        mov             x3,  #16
> +        movi            v20.8h, #0
> +        mvni            v21.8h, #0xFC, LSL #8 // movi #0x3FF
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +1:      subs            x3,  x3, #2
> +        ld1             {v16.8h-v17.8h}, [x0]
> +        ld1             {v0.8h-v3.8h},  [x1], #64
> +        sqadd           v0.8h, v0.8h, v16.8h
> +        ld1             {v18.8h-v19.8h}, [x12]
> +        sqadd           v1.8h, v1.8h, v17.8h
> +        sqadd           v2.8h, v2.8h, v18.8h
> +        sqadd           v3.8h, v3.8h, v19.8h
> +        clip10          v0.8h, v1.8h, v20.8h, v21.8h
> +        clip10          v2.8h, v3.8h, v20.8h, v21.8h
> +        st1             {v0.8h-v1.8h},   [x0], x2
> +        st1             {v2.8h-v3.8h},   [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_32x32_8_neon, export=1
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +        mov             x3,  #32
> +1:      subs            x3,  x3, #2
> +        ld1             {v20.16B, v21.16B}, [x0]
> +        uxtl            v16.8h,  v20.8B
> +        uxtl2           v17.8h,  v20.16B
> +        ld1             {v22.16B, v23.16B}, [x12]
> +        uxtl            v18.8h,  v21.8B
> +        uxtl2           v19.8h,  v21.16B
> +        uxtl            v20.8h,  v22.8B
> +        ld1             {v0.8h-v3.8h}, [x1], #64
> +        ld1             {v4.8h-v7.8h}, [x1], #64
> +        uxtl2           v21.8h,  v22.16B
> +        uxtl            v22.8h,  v23.8B
> +        uxtl2           v23.8h,  v23.16B
> +        sqadd           v0.8h, v0.8h,  v16.8h
> +        sqadd           v1.8h, v1.8h,  v17.8h

Here, the vertical alignment is visibly inconsistent across instructions 
where they could line up better.


Other than that, I've got nothing to complain about functionally, and it 
gives a very good speedup (3-14x depending on block size and core type).

// Martin


More information about the ffmpeg-devel mailing list