[FFmpeg-devel] [PATCH] lavc/aarch64: add some neon pix_abs functions

Martin Storsjö martin at martin.st
Tue Mar 15 00:28:18 EET 2022


On Mon, 7 Mar 2022, Swinney, Jonathan wrote:

> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct micro benchmarks of these ff functions versus their C implementations,
> these functions performed as follows on AWS Graviton 2:
>
> ff_pix_abs16_neon:
> c:  benchmark ran 100000 iterations in 0.955383 seconds
> ff: benchmark ran 100000 iterations in 0.097669 seconds
>
> ff_pix_abs16_xy2_neon:
> c:  benchmark ran 100000 iterations in 1.916759 seconds
> ff: benchmark ran 100000 iterations in 0.370729 seconds

I see that there are no checkasm tests for these functions - would you mind 
adding one? (There's something kind of like a checkasm test in 
libavcodec/tests/motion.c, but that one doesn't seem to be updated for 
contemporary SIMD instruction sets.)

Adding a checkasm test is important, as it tests for a bunch of aspects 
that otherwise can seem to work by accident (like missing zeroing/sign 
extension of the upper half of registers, clobbering callee-saved 
registers, etc). For functions of this size it's of course not hard to verify 
such aspects by hand, but I pretty much want to have checkasm coverage for 
all newly added assembly. (Plus, checkasm gives built-in benchmarking 
support for the functions.)
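To sketch what I have in mind (entirely untested - the buffer sizes, the 
tested heights and the checkasm_check_motion name are just placeholders, the 
test would also need to be wired up in tests/checkasm/Makefile and 
tests/checkasm/checkasm.c, and x86 would probably want declare_func_emms for 
the MMX versions):

#include "checkasm.h"
#include "libavcodec/me_cmp.h"
#include "libavutil/mem_internal.h"

#define WIDTH  64
#define HEIGHT 64

void checkasm_check_motion(void)
{
    LOCAL_ALIGNED_16(uint8_t, src1, [WIDTH * HEIGHT]);
    LOCAL_ALIGNED_16(uint8_t, src2, [WIDTH * HEIGHT]);
    AVCodecContext *avctx = avcodec_alloc_context3(NULL);
    MECmpContext c = { 0 };
    declare_func(int, struct MpegEncContext *, uint8_t *, uint8_t *,
                 ptrdiff_t, int);

    ff_me_cmp_init(&c, avctx);

    for (int i = 0; i < WIDTH * HEIGHT; i++) {
        src1[i] = rnd() & 0xff;
        src2[i] = rnd() & 0xff;
    }

    // pix_abs[0][0] is the plain 16xh SAD, pix_abs[0][3] the xy2 variant.
    // The MpegEncContext argument is unused by these functions, so NULL is
    // passed; the second source pointer is deliberately unaligned.
    if (check_func(c.pix_abs[0][0], "pix_abs16")) {
        for (int h = 1; h <= 16; h++) {         // covers the h < 4 tail paths too
            if (call_ref(NULL, src1, src2 + 1, WIDTH, h) !=
                call_new(NULL, src1, src2 + 1, WIDTH, h))
                fail();
        }
        bench_new(NULL, src1, src2 + 1, WIDTH, 16);
    }
    if (check_func(c.pix_abs[0][3], "pix_abs16_xy2")) {
        for (int h = 1; h <= 16; h++) {
            if (call_ref(NULL, src1, src2 + 1, WIDTH, h) !=
                call_new(NULL, src1, src2 + 1, WIDTH, h))
                fail();
        }
        bench_new(NULL, src1, src2 + 1, WIDTH, 16);
    }

    avcodec_free_context(&avctx);
    report("pix_abs");
}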

> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> new file mode 100644
> index 0000000000..fb827daaf5
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -0,0 +1,39 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavcodec/mpegvideo.h"
> +
> +int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> +                       ptrdiff_t stride, int h);
> +int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
> +                       ptrdiff_t stride, int h);

The second line seems misindented for both functions

> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> new file mode 100644
> index 0000000000..85b0e4bd9e
> --- /dev/null
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -0,0 +1,219 @@
> +/*
> + * Copyright (c) 2022 Jonathan Swinney <jswinney at amazon.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +function ff_pix_abs16_neon, export=1

Nit: Empty line between the include and the function

> +        // x0   unused
> +        // x1   uint8_t *pix1
> +        // x2   uint8_t *pix2
> +        // x3   ptrdiff_t stride
> +        // w4   int h
> +        // x5   uint8_t *pix3
> +        cmp     w4, #4                      // if h < 4, jump to completion section

Please indent the assembly according to the existing code; 8 spaces before 
the instruction column, 24 chars before the first operand.

> +        b.lt    2f
> +        movi    v18.4S, #0                  // clear result accumulator

Nit: I prefer lower case for the element specifier (.4s)

> +1:
> +        movi    v16.8H, #0                  // clear uabal accumulator
> +        ld1     {v0.16B}, [x1], x3          // load pix1
> +        ld1     {v4.16B}, [x2], x3          // load pix2
> +        ld1     {v1.16B}, [x1], x3          // load pix1
> +        ld1     {v5.16B}, [x2], x3          // load pix2
> +        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
> +        uabal2  v16.8H, v0.16B, v4.16B
> +        ld1     {v2.16B}, [x1], x3          // load pix1
> +        ld1     {v6.16B}, [x2], x3          // load pix2
> +        uabal   v16.8H, v1.8B, v5.8B        // absolute difference accumulate
> +        uabal2  v16.8H, v1.16B, v5.16B
> +        ld1     {v3.16B}, [x1], x3
> +        ld1     {v7.16B}, [x2], x3
> +        uabal   v16.8H, v2.8B, v6.8B
> +        uabal2  v16.8H, v2.16B, v6.16B
> +        sub     w4, w4, #4                  // h -= 4
> +        uabal   v16.8H, v3.8B, v7.8B
> +        uabal2  v16.8H, v3.16B, v7.16B
> +        cmp     w4, #4                      // if h >= 4, loop
> +        addv    h17, v16.8H                 // add up everything in v16 accumulator
> +        add     d18, d17, d18               // add to the end result register

Would it be beneficial to not do the addv here on each iteration, but 
accumulate in v18.8h, and then just do one single addv at the end?
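I.e. roughly something like this (untested) - with at most 2*255 added per 
lane and row, the 16 bit lanes have plenty of headroom for the block heights 
these functions get called with:

        movi            v18.8h, #0              // sum for the whole block
1:
        ...
        uabal           v16.8h, v3.8b, v7.8b
        uabal2          v16.8h, v3.16b, v7.16b
        add             v18.8h, v18.8h, v16.8h  // accumulate as vectors, no addv in the loop
        b.ge            1b
        ...
        uaddlv          s18, v18.8h             // one single horizontal reduction at the end
        fmov            w0, s18
        ret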

> +
> +        b.ge    1b
> +        cbnz    w4, 2f                      // if iterations remain, jump to completion section
> +
> +        mov     w0, v18.S[0]                // copy result to general purpose register
> +        ret
> +
> +2:
> +        movi    v16.8H, #0                  // clear the uabal accumulator
> +        ld1     {v0.16B}, [x1]              // load pix1
> +        ld1     {v4.16B}, [x2]              // load pix2
> +        add     x1, x1, x3                  // increment pointers
> +        add     x2, x2, x3

Why not use the regular post-increment on the ld1, like above?

> +        uabal   v16.8H, v0.8B, v4.8B        // absolute difference accumulate
> +        uabal2  v16.8H, v0.16B, v4.16B
> +        addv    h17, v16.8H                 // add up v16
> +        add     d18, d17, d18               // add to result

If we got here via the "b.lt 2f" at the start of the function, d18 is 
uninitialized here. (A proper checkasm test would test heights < 4.)
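The simplest fix is probably to clear the accumulator before the early 
branch, e.g.:

        movi            v18.4s, #0              // clear result accumulator
        cmp             w4, #4                  // if h < 4, jump to the completion section
        b.lt            2f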

> +        subs    w4, w4, #1                  // h -= 1
> +        b.ne    2b
> +
> +        mov     w0, v18.S[0]                // copy result to general purpose register
> +        ret
> +endfunc
> +
> +function ff_pix_abs16_xy2_neon, export=1
> +        // x0   unused
> +        // x1   uint8_t *pix1
> +        // x2   uint8_t *pix2
> +        // x3   ptrdiff_t stride
> +        // w4   int h
> +        // x5   uint8_t *pix3
> +        add     x5, x2, x3                  // create a pointer for pix3
> +        movi    v0.2D, #0                   // initialize the result register
> +
> +        // I also tested these instructions to get pix2+1 from pix2, but it wasn't faster
> +        // than just doing another full (unaligned) load.
> +        // ldr     b21, [x5, #16]
> +        // ushr    v4.2D, v2.2D, #8
> +        // mov     v4.16B[15], v21.16B[0]
> +        // mov     v4.16B[7], v2.16B[8]

This version most certainly would be slower indeed. If we could be ok with 
doing a bit of overread, the simplest version might be to load e.g. "ld1 
{v4.16b, v5.16b}, [x2]" followed by "ext v5.16b, v4.16b, v5.16b, #1" to 
shift it. But doing an overlapping unaligned load probably is fine too.
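In context that would be something like this (note that it reads 16 bytes 
past the end of the row):

        ld1             {v4.16b, v5.16b}, [x2]      // load pix2 and the next 16 bytes
        ext             v5.16b, v4.16b, v5.16b, #1  // v5 = pix2 + 1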

> +
> +        // Load initial pix2 values for either the unrolled version or the completion version.
> +        ldr     q4, [x2, #1]                // load pix2+1
> +        ldr     q3, [x2]                    // load pix2
> +        uaddl   v2.8H, v3.8B, v4.8B         // pix2 + pix2+1 0..7
> +        uaddl2  v4.8H, v3.16B, v4.16B       // pix2 + pix2+1 8..15
> +        cmp     w4, #4                      // if h < 4 jump to the completion version
> +        b.lt    2f
> +1:
> +        // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
> +        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
> +        // plus two at the beginning to start.
> +        ldr     q5, [x5, #1]                // load pix3+1
> +        ld1     {v3.16B}, [x5], x3          // load pix3
> +        ld1     {v1.16B}, [x1], x3          // load pix1
> +
> +        ldr     q16, [x5, #1]               // load pix3+1
> +        ld1     {v7.16B}, [x5], x3          // load pix3
> +        ld1     {v6.16B}, [x1], x3          // load pix1
> +
> +        ldr     q19, [x5, #1]               // load pix3+1
> +        ld1     {v18.16B}, [x5], x3         // load pix3
> +        ld1     {v17.16B}, [x1], x3         // load pix1
> +
> +        ldr     q22, [x5, #1]               // load pix3+1
> +        ld1     {v21.16B}, [x5], x3         // load pix3
> +        ld1     {v20.16B}, [x1], x3         // load pix1
> +
> +        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
> +        uaddl   v30.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
> +        uaddl2  v31.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
> +        add     v23.8H, v2.8H, v30.8H       // add up 0..7, using pix2 + pix2+1 values from previous iteration
> +        add     v24.8H, v4.8H, v31.8H       // add up 8..15, using pix2 + pix2+1 values from previous iteration
> +        urshr   v23.8H, v23.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v24.8H, v24.8H, #2          // shift right 2 8..15
> +
> +        uaddl   v2.8H, v7.8B, v16.8B        // pix3 + pix3+1 0..7
> +        uaddl2  v4.8H, v7.16B, v16.16B      // pix3 + pix3+1 8..15
> +        add     v26.8H, v30.8H, v2.8H       // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add     v27.8H, v31.8H, v4.8H       // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr   v26.8H, v26.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v27.8H, v27.8H, #2          // shift right 2 8..15
> +
> +        uaddl   v3.8H, v18.8B, v19.8B       // pix3 + pix3+1 0..7
> +        uaddl2  v5.8H, v18.16B, v19.16B     // pix3 + pix3+1 8..15
> +        add     v28.8H, v2.8H, v3.8H        // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add     v29.8H, v4.8H, v5.8H        // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr   v28.8H, v28.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v29.8H, v29.8H, #2          // shift right 2 8..15
> +
> +        uaddl   v2.8H, v21.8B, v22.8B       // pix3 + pix3+1 0..7
> +        uaddl2  v4.8H, v21.16B, v22.16B     // pix3 + pix3+1 8..15
> +        add     v30.8H, v3.8H, v2.8H        // add up 0..7, using pix2 + pix2+1 values from pix3 above
> +        add     v31.8H, v5.8H, v4.8H        // add up 8..15, using pix2 + pix2+1 values from pix3 above
> +        urshr   v30.8H, v30.8H, #2          // shift right 2 0..7 (rounding shift right)
> +        urshr   v31.8H, v31.8H, #2          // shift right 2 8..15
> +
> +        // Averages are now stored in these registers:
> +        // v23, v24
> +        // v26, v27
> +        // v28, v29
> +        // v30, v31
> +        // pix1 values in these registers:
> +        // v1, v6, v17, v20
> +        // available
> +        // v3, v5, v7, v16, v18, v19, v25
> +
> +        uxtl2   v3.8H, v1.16B               // 8->16 bits pix1 8..15
> +        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
> +        uxtl2   v7.8H, v6.16B               // 8->16 bits pix1 8..15
> +        uxtl    v6.8H, v6.8B                // 8->16 bits pix1 0..7
> +        uxtl2   v18.8H, v17.16B             // 8->16 bits pix1 8..15
> +        uxtl    v17.8H, v17.8B              // 8->16 bits pix1 0..7
> +        uxtl2   v25.8H, v20.16B             // 8->16 bits pix1 8..15
> +        uxtl    v20.8H, v20.8B              // 8->16 bits pix1 0..7
> +
> +        uabd    v5.8H, v1.8H, v23.8H        // absolute difference 0..7
> +        uaba    v5.8H, v3.8H, v24.8H        // absolute difference accumulate 8..15
> +        uaba    v5.8H, v6.8H, v26.8H        // absolute difference accumulate 0..7
> +        uaba    v5.8H, v7.8H, v27.8H        // absolute difference accumulate 8..15
> +        uaba    v5.8H, v17.8H, v28.8H       // absolute difference accumulate 0..7
> +        uaba    v5.8H, v18.8H, v29.8H       // absolute difference accumulate 8..15
> +        uaba    v5.8H, v20.8H, v30.8H       // absolute difference accumulate 0..7
> +        uaba    v5.8H, v25.8H, v31.8H       // absolute difference accumulate 8..15
> +
> +        uaddlv  s5, v5.8H                   // add up accumulated values
> +        add     d0, d0, d5                  // add to final result

Same thing as above; try whether deferring the horizontal addv until the 
end is helpful. Also, for in-order cores, it'd be good to avoid direct 
chaining of dependencies like this - you could move e.g. the "sub" from 
below between the uaddlv/add pair. But if you avoid uaddlv altogether, 
it's more straightforward (then it's probably best to place the sub before 
the accumulation of the results).
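To illustrate the first point, i.e. just hiding the uaddlv latency with the 
sub:

        uaddlv          s5, v5.8h               // add up accumulated values
        sub             w4, w4, #4              // h -= 4
        add             d0, d0, d5              // add to final result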

> +
> +        sub     w4, w4, #4                  // h -= 4
> +        cmp     w4, #4                      // loop if h >= 4

To avoid stalls on in-order cores, consider moving both the sub and the 
cmp further away from each other, and the cmp further away from the b.ge.
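If the uaddlv is dropped as suggested above, the tail of the loop could look 
roughly like this (untested sketch):

        sub             w4, w4, #4              // h -= 4, right after the last loads
        ...
        uaba            v5.8h, v17.8h, v28.8h   // absolute difference accumulate 0..7
        uaba            v5.8h, v18.8h, v29.8h   // absolute difference accumulate 8..15
        cmp             w4, #4                  // well ahead of the b.ge
        uaba            v5.8h, v20.8h, v30.8h   // absolute difference accumulate 0..7
        uaba            v5.8h, v25.8h, v31.8h   // absolute difference accumulate 8..15
        add             v0.8h, v0.8h, v5.8h     // accumulate as a vector
        b.ge            1b

with one single uaddlv of v0.8h into s0 right before the final fmov (and the 
completion section changed to accumulate the same way, since d0 no longer 
holds a scalar sum).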

> +        b.ge    1b
> +        cbnz    w4, 2f                      // if iterations remain jump to completion section
> +
> +        fmov    w0, s0                      // copy result to general purpose register

If applying Sebastian's suggestions, please apply them similarly to 
ff_pix_abs16_neon above too.

> +        ret
> +2:
> +        // v2 and v4 are set either at the end of this loop or from the unrolled version
> +        // which branches here to complete iterations when h % 4 != 0.
> +        ldr     q5, [x5, #1]                // load pix3+1
> +        ld1     {v3.16B}, [x5], x3          // load pix3
> +        ld1     {v1.16B}, [x1], x3          // load pix1
> +        sub     w4, w4, #1                  // decrement h
> +
> +        uaddl   v18.8H, v3.8B, v5.8B        // pix3 + pix3+1 0..7
> +        uaddl2  v19.8H, v3.16B, v5.16B      // pix3 + pix3+1 8..15
> +        add     v16.8H, v2.8H, v18.8H       // add up 0..7, using pix2 + pix2+1 values from previous iteration
> +        add     v17.8H, v4.8H, v19.8H       // add up 8..15, using pix2 + pix2+1 values from previous iteration
> +        // divide by 4 to compute the average of values summed above
> +        urshr   v16.8H, v16.8H, #2          // shift right by 2 0..7 (rounding shift right)
> +        urshr   v17.8H, v17.8H, #2          // shift right by 2 8..15
> +
> +        uxtl2   v8.8H, v1.16B               // 8->16 bits pix1 8..15
> +        uxtl    v1.8H, v1.8B                // 8->16 bits pix1 0..7
> +
> +        uabd    v6.8H, v1.8H, v16.8H        // absolute difference 0..7
> +        uaba    v6.8H, v8.8H, v17.8H        // absolute difference accumulate 8..15
> +        addv    h6, v6.8H                   // add up accumulator in v6
> +        add     d0, d0, d6
> +
> +        mov     v2.16B, v18.16B             // pix3 -> pix2
> +        mov     v4.16B, v19.16B             // pix3+1 -> pix2+1

It's probably better for in-order cores if you'd place these mov 
instructions somewhere between the uaba and the following adds.
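I.e. something along these lines:

        uabd            v6.8h, v1.8h, v16.8h    // absolute difference 0..7
        uaba            v6.8h, v8.8h, v17.8h    // absolute difference accumulate 8..15
        mov             v2.16b, v18.16b         // pix3 -> pix2, while the uaba result is still in flight
        mov             v4.16b, v19.16b         // pix3+1 -> pix2+1
        addv            h6, v6.8h               // add up accumulator in v6
        add             d0, d0, d6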

> +
> +        cbnz    w4, 2b                      // loop if h > 0
> +        fmov    w0, s0                      // copy result to general purpose register
> +        ret
> +endfunc
> diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
> index b2f87d2e1b..60053a1b92 100644
> --- a/libavcodec/me_cmp.c
> +++ b/libavcodec/me_cmp.c
> @@ -1064,6 +1064,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
>         ff_me_cmp_init_alpha(c, avctx);
>     if (ARCH_ARM)
>         ff_me_cmp_init_arm(c, avctx);
> +    if (ARCH_AARCH64)
> +        ff_me_cmp_init_aarch64(c, avctx);

Alphabetical order please; AARCH64 comes before ARM.
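I.e.:

    if (ARCH_AARCH64)
        ff_me_cmp_init_aarch64(c, avctx);
    if (ARCH_ALPHA)
        ff_me_cmp_init_alpha(c, avctx);
    if (ARCH_ARM)
        ff_me_cmp_init_arm(c, avctx);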

>     if (ARCH_PPC)
>         ff_me_cmp_init_ppc(c, avctx);
>     if (ARCH_X86)
> diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
> index e9b5161c9a..4dd059223d 100644
> --- a/libavcodec/me_cmp.h
> +++ b/libavcodec/me_cmp.h
> @@ -82,6 +82,7 @@ typedef struct MECmpContext {
> void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
> +void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);

Alphabetical order please.

// Martin


