[FFmpeg-devel] [PATCH] lavc/alsdec: Add NEON optimizations

Martin Storsjö martin at martin.st
Mon Mar 1 15:16:39 EET 2021


Hi Thilo,

On Sun, 28 Feb 2021, Thilo Borgmann wrote:

> it's my first attempt to do some assembly, it might still include some don'ts of the asm world...
> Tested with gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
>
> Speed-wise, it sees a drop for small prediction orders until around 10 or 11.
> Well, the maximum prediction order is 1023.
> I therefore checked with the "real-world" samples from the fate-suite, which suggests low prediction orders are non-dominant:
>
> pred_order = {7..17}, gain: 23%
>
> als_reconstruct_all_c: 26645.2
> als_reconstruct_all_neon: 20635.2

This is the combination that the patch actually tests by default, if I 
read the code correctly - right?

You didn't write what CPU you tested this on - do note that the actual 
performance of the assembly is pretty heavily dependent on the CPU.

I get roughly similar numbers if I build with GCC:

                          Cortex A53      A72      A73
als_reconstruct_all_c:     107708.2  44044.5  57427.7
als_reconstruct_all_neon:   78895.7  38464.7  34065.5

However - if I build with Clang, where vectorization isn't disabled by 
configure, the C code beats the handwritten assembly:

                        Cortex A53
als_reconstruct_all_c:    69145.7
als_reconstruct_all_neon: 78895.7

Even if I only test order 17, the C code is still faster. So clearly we 
can do better - if nothing else, we could copy the assembly code that 
Clang outputs :-)


First, a couple of technical details about the patch...

> new file mode 100644
> index 0000000000..130b1a615e
> --- /dev/null
> +++ b/libavcodec/aarch64/alsdsp_init_aarch64.c
> @@ -0,0 +1,35 @@
> +
> +#include "config.h"
> +
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavcodec/alsdsp.h"
> +
> +void ff_alsdsp_reconstruct_all_neon(int32_t *samples, int32_t *samples_end, int32_t *coeffs, unsigned int opt_order);

Nit: Long line?

> diff --git a/libavcodec/aarch64/alsdsp_neon.S b/libavcodec/aarch64/alsdsp_neon.S
> new file mode 100644
> index 0000000000..fe95eaf843
> --- /dev/null
> +++ b/libavcodec/aarch64/alsdsp_neon.S
> @@ -0,0 +1,155 @@
> +
> +#include "libavutil/aarch64/asm.S"
> +#include "neon.S"
> +
> +//void ff_alsdsp_reconstruct_all_neon(int32_t *samples, int32_t *samples_end, int32_t *coeffs, unsigned int opt_order);
> +// x0: int32_t *samples
> +// x1: int32_t *samples_end
> +// x2: int32_t *coeffs
> +// w3: unsigned int opt_order
> +function ff_alsdsp_reconstruct_all_neon, export = 1

Write the named macro argument without extra spaces, i.e. "export=1".
Otherwise this breaks building with gas-preprocessor.
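
I.e.:

     function ff_alsdsp_reconstruct_all_neon, export=1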

> +    sub         sp, sp, #128

Please align instructions and operands in the same way as in other sources.
Also for the operands, I'd recommend aligning the columns according to
the max width for each operand. E.g. like this:

     lsl         x3,  x3,  #32
     neg         x3,  x3,  lsr #32
     lsl         x10, x3,  #2

That way the columns line up nicely regardless of which registers are
used. And for vector register operands, align them so that the max
sized register (v16.16b) would fit. For things that deviate from the
regular form (e.g. loads/stores etc) just align things so it looks
pretty.


> +    st1         {v8.4s  - v11.4s}, [sp], #64
> +    st1         {v12.4s - v15.4s}, [sp], #64

You aren't using registers v16-v31 at all. You could use those instead
of v8-v15, which would remove the need to back up and restore these
registers.
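
E.g. with the accumulators in v16-v23 instead (and the smlal destinations
renamed to match), the sub sp/st1 at entry and the ld1 at exit can be
dropped entirely:

     movi        v16.2d, #0
     movi        v17.2d, #0
     movi        v18.2d, #0
     movi        v19.2d, #0
     movi        v20.2d, #0
     movi        v21.2d, #0
     movi        v22.2d, #0
     movi        v23.2d, #0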

> +// avoid 32-bit clubber from register

Nit: The common spelling is "clobber"

> +    lsl         x3, x3, #32
> +    neg         x3, x3, lsr #32

There's normally no need to do such manual cleanup of the
argument that might have junk in the upper half. If you
really need to, you can fix it by doing "mov w3, w3" (which zeroes
the upper half) or "sxtw x3, w3" (which sign extends), but in most
cases you can avoid it by just
making sure to refer to the register as w3 instead of x3
the first time you use it. If you do a write to a register
in the form wN instead of xN, it will implicitly clear the
upper half, so you could use the xN form after that. That
doesn't work quite as easily here when you want to have it
fully negated though.

But e.g. with something like this, it works just fine:

     neg         w3,  w3
     lsl         w10, w3,  #2
     // Sign extension when used with a 64 bit register
     add         x4,  x0,  w10, sxtw
     add         x5,  x2,  w10, sxtw
     mov         w6,  w3
     // All other uses use w6 instead of x6 etc.



> +// x10 counts the bytes left to read, set to 4 * -opt_order
> +    lsl         x10, x3, #2
> +
> +// loop x0 .. x1
> +1:  cmp         x0, x1
> +    b.eq        4f
> +
> +// samples - opt_order, coeffs - opt_order
> +    add         x4, x0, x10
> +    add         x5, x2, x10
> +// reset local counter: count -opt_order .. 0
> +    mov         x6, x3
> +
> +// reset local acc
> +    movi        v8.2d,  #0
> +    movi        v9.2d,  #0
> +    movi        v10.2d, #0
> +    movi        v11.2d, #0
> +    movi        v12.2d, #0
> +    movi        v13.2d, #0
> +    movi        v14.2d, #0
> +    movi        v15.2d, #0
> +
> +// loop over 16 samples while >= 16 more to read
> +    adds        x6, x6, #16
> +    b.gt        3f
> +
> +2:  ld1         {v0.4s - v3.4s}, [x4], #64
> +    ld1         {v4.4s - v7.4s}, [x5], #64
> +
> +    smlal       v8.2d,  v0.2s, v4.2s
> +    smlal2      v9.2d,  v0.4s, v4.4s
> +    smlal       v10.2d, v1.2s, v5.2s
> +    smlal2      v11.2d, v1.4s, v5.4s
> +    smlal       v12.2d, v2.2s, v6.2s
> +    smlal2      v13.2d, v2.4s, v6.4s
> +    smlal       v14.2d, v3.2s, v7.2s
> +    smlal2      v15.2d, v3.4s, v7.4s
> +
> +    adds        x6, x6, #16
> +    b.le        2b
> +
> +// reduce to four NEON registers
> +// acc values into register
> +3:  subs        x6, x6, #16
> +
> +    add         v4.2d, v8.2d,  v9.2d
> +    add         v5.2d, v10.2d, v11.2d
> +    add         v6.2d, v12.2d, v13.2d
> +    add         v7.2d, v14.2d, v15.2d
> +
> +// next 8 samples
> +    cmn         x6, #8
> +    b.gt        3f
> +
> +    ld1         {v0.4s - v1.4s}, [x4], #32
> +    ld1         {v2.4s - v3.4s}, [x5], #32
> +
> +    smlal       v4.2d, v0.2s, v2.2s
> +    smlal2      v5.2d, v0.4s, v2.4s
> +    smlal       v6.2d, v1.2s, v3.2s
> +    smlal2      v7.2d, v1.4s, v3.4s
> +
> +    adds        x6, x6, #8
> +
> +// reduce to two NEON registers
> +// acc values into register
> +3:  add         v2.2d, v4.2d, v5.2d
> +    add         v3.2d, v6.2d, v7.2d
> +
> +// next 4 samples
> +    cmn         x6, #4
> +    b.gt        3f
> +
> +    ld1         {v0.4s}, [x4], #16
> +    ld1         {v1.4s}, [x5], #16
> +
> +    smlal       v2.2d, v0.2s, v1.2s
> +    smlal2      v3.2d, v0.4s, v1.4s
> +
> +    adds        x6, x6, #4
> +
> +// reduce to A64 registers
> +// acc values into register
> +3:  add         v2.2d, v2.2d, v3.2d
> +    mov         x7, v2.2d[0]
> +    mov         x8, v2.2d[1]

This breaks building both with Clang and with MS armasm64.exe
(via gas-preprocessor); binutils accepts the syntax "v2.2d[0]" here
but the correct form is "v2.d[0]" (as you're only accessing one
lane at a time, it doesn't matter if you see the register as full
or half).
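
I.e.:

     mov         x7,  v2.d[0]
     mov         x8,  v2.d[1]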

> +    add         x7, x7, x8
> +
> +    cmn         x6, #0
> +    b.eq        3f
> +
> +// loop over the remaining < 4 samples to read
> +2:  ldrsw       x8, [x4], #4
> +    ldrsw       x9, [x5], #4
> +
> +    madd        x7, x8, x9, x7
> +    adds        x6, x6, #1
> +    b.lt        2b
> +
> +// add 1<<19 and store s-=X>>20
> +3:  mov         x9, #1
> +    lsl         x9, x9, #19
> +    add         x7, x7, x9
> +    neg         x7, x7, asr #20
> +
> +    ldrsw       x9, [x4]
> +    add         x9, x9, x7
> +    str         w9, [x4]
> +
> +// increment samples and loop
> +    add         x0, x0, #4
> +    b           1b
> +
> +4:  sub         sp, sp, #128
> +    ld1         {v8.4s  - v11.4s}, [sp], #64
> +    ld1         {v12.4s - v15.4s}, [sp], #64
> +
> +    ret
> +endfunc
> diff --git a/libavcodec/alsdsp.c b/libavcodec/alsdsp.c
> new file mode 100644
> index 0000000000..00270bb5e6
> --- /dev/null
> +++ b/libavcodec/alsdsp.c
> +#include "libavutil/attributes.h"
> +#include "libavutil/samplefmt.h"
> +#include "mathops.h"
> +#include "alsdsp.h"
> +#include "config.h"
> +
> +static void als_reconstruct_all_c(int32_t *raw_samples, int32_t *raw_samples_end, int32_t *lpc_cof, unsigned int opt_order)
> +{
> +	int64_t y;
> +	int sb;
> +
> +        for (; raw_samples < raw_samples_end; raw_samples++) {
> +            y = 1 << 19;
> +
> +            for (sb = -opt_order; sb < 0; sb++)
> +                y += (uint64_t)MUL64(lpc_cof[sb], raw_samples[sb]);
> +
> +            *raw_samples -= y >> 20;
> +        }

This new file uses incorrect indentation and even uses tabs.
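
With the usual 4 space indentation (and no tabs) it would read e.g.:

static void als_reconstruct_all_c(int32_t *raw_samples, int32_t *raw_samples_end,
                                  int32_t *lpc_cof, unsigned int opt_order)
{
    int64_t y;
    int sb;

    for (; raw_samples < raw_samples_end; raw_samples++) {
        y = 1 << 19;

        for (sb = -opt_order; sb < 0; sb++)
            y += (uint64_t)MUL64(lpc_cof[sb], raw_samples[sb]);

        *raw_samples -= y >> 20;
    }
}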

> +
> +
> +av_cold void ff_alsdsp_init(ALSDSPContext *ctx)
> +{
> +    ctx->reconstruct_all = als_reconstruct_all_c;
> +
> +    if (ARCH_AARCH64)
> +        ff_alsdsp_init_neon(ctx);

I think the norm here would be to have this function be called
*_aarch64, as it's behind an ARCH_AARCH64 check. (In the future
we could have other SIMD instruction sets on aarch64, which would all
go through the same init function, just like on x86 where
there's SSE* and AVX*.)
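
I.e. roughly like this (just a sketch of the usual wiring; I haven't
checked what your init file does beyond the quoted hunk):

// libavcodec/alsdsp.c
av_cold void ff_alsdsp_init(ALSDSPContext *ctx)
{
    ctx->reconstruct_all = als_reconstruct_all_c;

    if (ARCH_AARCH64)
        ff_alsdsp_init_aarch64(ctx);
}

// libavcodec/aarch64/alsdsp_init_aarch64.c
av_cold void ff_alsdsp_init_aarch64(ALSDSPContext *ctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        ctx->reconstruct_all = ff_alsdsp_reconstruct_all_neon;
}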

> diff --git a/tests/checkasm/alsdsp.c b/tests/checkasm/alsdsp.c
> new file mode 100644
> index 0000000000..f35c7d49be
> --- /dev/null
> +++ b/tests/checkasm/alsdsp.c
> @@ -0,0 +1,81 @@
> +
> +void checkasm_check_alsdsp(void)
> +{
> +    LOCAL_ALIGNED_16(uint32_t, ref_samples, [1024]);
> +    LOCAL_ALIGNED_16(uint32_t, ref_coeffs,  [1024]);
> +    LOCAL_ALIGNED_16(uint32_t, new_samples, [1024]);
> +    LOCAL_ALIGNED_16(uint32_t, new_coeffs,  [1024]);
> +
> +    ALSDSPContext dsp;
> +    ff_alsdsp_init(&dsp);
> +
> +    if (check_func(dsp.reconstruct_all, "als_reconstruct_all")) {
> +        declare_func(void, int32_t *samples, int32_t *samples_end, int32_t *coeffs, unsigned int opt_order);
> +        int32_t *s, *c, *e;
> +        unsigned int o;
> +        unsigned int O[] = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
> +        for (int k = 0; k <11; k++) {

Would it be good to use FF_ARRAY_ELEMS() here instead of hardcoding 11?
That would simplify testing in various configurations.

Alternatively, you could have "for (o = 7; o <= 17; o++)", but the array form
is nice for testing specific values as long as it's not a very long range.
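
I.e. something like (the array name is just for illustration):

    static const unsigned int orders[] = { 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 };

    for (int k = 0; k < FF_ARRAY_ELEMS(orders); k++) {
        o = orders[k];
        ...
    }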

> +            o = O[k];
> +
> +            randomize_buffers();
> +
> +            s = (int32_t*)(ref_samples + o);

Can't you just declare the arrays as int32_t to avoid these casts?
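
I.e.:

    LOCAL_ALIGNED_16(int32_t, ref_samples, [1024]);
    LOCAL_ALIGNED_16(int32_t, ref_coeffs,  [1024]);
    LOCAL_ALIGNED_16(int32_t, new_samples, [1024]);
    LOCAL_ALIGNED_16(int32_t, new_coeffs,  [1024]);
    ...
    s = ref_samples + o;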



Now for the actual algorithm - while you do use SIMD instructions for 
doing the multiplications (which is the biggest part when you have long 
filters), the algorithm is still serial in the end (you only ever produce 
a single output sample at a time), and for small filter sizes this serial 
part is a significant portion of the runtime.

For an even more SIMDy algorithm you would filter and produce e.g. 4 
output samples at a time.

Is it ok to write up to 3 samples past samples_end? That would avoid the 
need for extra edge conditions. If not, can the calling code be rearranged 
to allow that?

Is lpc_coef[] before -opt_order undefined, or can it be arranged so that 
lpc_coef[] is padded with a couple of zeros before the first coefficient we 
need to care about? That would allow us to ignore even more boundary 
conditions and e.g. round opt_order up to the nearest multiple of 4, which 
would simplify things even more.

Anyway, the way of filtering multiple samples at a time, while avoiding 
any serial processing, looks like this:


opt_order = FFALIGN(opt_order, 4);

while (samples_left > 0) {
     // Make this check >= 4 (with serial processing for the tail)
     // if we aren't allowed to go past the end of the buffer
     ptr = raw_samples - 4*opt_order;
     lpc_coef -= 4*opt_order;
     ld1 {v1.4s}, [ptr], #16 // load 4 samples
     n = opt_order;

     movi v4.2d, #0 // accumulator
     movi v5.2d, #0
     do {
         ld1 {v2.4s}, [ptr], #16 // load 4 more samples
         ld1 {v0.4s}, [lpc_coef], #16 // load 4 coefficients

         ext v16.16b, v1.16b, v2.16b, #4 // Samples v1-v2 offset by 1
         ext v17.16b, v1.16b, v2.16b, #8 // offset by 2
         ext v18.16b, v1.16b, v2.16b, #12 // offset by 3

         smlal  v4.2d,  v1.2s,  v0.s[0]
         smlal2 v5.2d,  v1.4s,  v0.s[0]
         smlal  v4.2d,  v16.2s, v0.s[1]
         smlal2 v5.2d,  v16.4s, v0.s[1]
         smlal  v4.2d,  v17.2s, v0.s[2]
         smlal2 v5.2d,  v17.4s, v0.s[2]
         smlal  v4.2d,  v18.2s, v0.s[3]
         smlal2 v5.2d,  v18.4s, v0.s[3]
         // For in-order cores like A53 and A55, this can also be more
         // efficient if reordered to do all 4 smlal to v4.2d first,
         // followed by 4 smlal2 to v5.2d.

         mov v1.16b, v2.16b // shift input samples

         n -= 4;
     } while (n > 0);
     // v4-v5 now contains the final sum for 4 output samples, done in parallel

     // Use the rounding shift function for doing the (sum + (1<<19)) >> 20
     rshrn  v4.2s, v4.2d, #20
     rshrn2 v4.4s, v5.2d, #20
     // v2 still holds the corresponding input samples here; after the
     // sub below it becomes the 4 output samples
     // Rewind ptr to point at where v2 was loaded from. That way we don't
     // need a separate pointer register for this, we should be able to
     // do with just one register for all input/output to raw_samples.
     sub    ptr, ptr, #16
     sub    v2.4s, v2.4s,  v4.4s
     st1    {v2.4s}, [ptr], #16

     samples_left -= 4;
     raw_samples  += 16; // advance past the 4 samples just written (byte offsets, like ptr)
}


// Martin


