[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: Add pix_abs16_x2 neon implementation

Swinney, Jonathan jswinney at amazon.com
Mon Jul 11 22:59:40 EEST 2022


> +        // accumulate the result in d18
> +        add             d18, d18, d16
> +        add             d18, d18, d17
> +        add             d18, d18, d19
> +        add             d18, d18, d21

Did you experiment with distributing these instructions to each of the iteration blocks? It might be marginally faster since you could reduce the data dependencies in adjacent instructions.

-- 
Jonathan Swinney

From: Hubert Mazur <hum at semihalf.com>
Date: Monday, July 11, 2022 at 7:23 AM
To: "ffmpeg-devel at ffmpeg.org" <ffmpeg-devel at ffmpeg.org>
Cc: "Pop, Sebastian" <spop at amazon.com>, "Swinney, Jonathan" <jswinney at amazon.com>, Martin Storsjö <martin at martin.st>, Grzegorz Bernacki <gjb at semihalf.com>, Marcin Wojtas <mw at semihalf.com>
Subject: RE: [EXTERNAL][PATCH 2/2] lavc/aarch64: Add pix_abs16_x2 neon implementation

CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.

Hi, do you have any feedback regarding the patch?
Regards,
Hubert

On Wed, Jun 29, 2022 at 10:25 AM Hubert Mazur <hum at semihalf.com> wrote:
Provide neon implementation for pix_abs16_x2 function.

Performance tests of implementation are below.
 - pix_abs_0_1_c: 291.9
 - pix_abs_0_1_neon: 73.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum at semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |   3 +
 libavcodec/aarch64/me_cmp_neon.S         | 134 +++++++++++++++++++++++
 2 files changed, 137 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index bec9148a1a..136b008eb7 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -27,6 +27,8 @@ int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       ptrdiff_t stride, int h);
 int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);

 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -34,6 +36,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)

     if (have_neon(cpu_flags)) {
         c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;

         c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a7937bd8be..c2fd94f4b3 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -203,3 +203,137 @@ function ff_pix_abs16_xy2_neon, export=1
         fmov            w0, s0                      // copy result to general purpose register
         ret
 endfunc
+
+function ff_pix_abs16_x2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // x4           int h
+
+        // preserve the lower 64 bits of the callee-saved v8-v11 registers
+        // (AAPCS64 requires only d8-d15 to be preserved)
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8, d9, [sp, #-0x10]!
+
+        // initialize buffers
+        movi            d18, #0                     // SAD accumulator
+        movi            v20.8h, #1                  // rounding constant for avg2
+        add             x5, x2, #1 // pix2 + 1
+        cmp             w4, #4
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        // v0 - pix1
+        // v1 - pix2
+        // v2 - pix2 + 1
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+
+        ld1             {v3.16b}, [x1], x3
+        ld1             {v4.16b}, [x2], x3
+        ld1             {v5.16b}, [x5], x3
+
+        ld1             {v6.16b}, [x1], x3
+        ld1             {v7.16b}, [x2], x3
+        ld1             {v8.16b}, [x5], x3
+
+        ld1             {v9.16b}, [x1], x3
+        ld1             {v10.16b}, [x2], x3
+        ld1             {v11.16b}, [x5], x3
+
+        // abs(pix1[0] - avg2(pix2[0], pix2[1]))
+        // avg2(a,b) = (((a) + (b) + 1) >> 1)
+        // abs(x) = (x < 0 ? -x : x)
+
+        // pix2[0] + pix2[1], widened to 16 bits (max 255+255+1 = 511 fits)
+        uaddl           v30.8h, v1.8b, v2.8b
+        uaddl2          v29.8h, v1.16b, v2.16b
+        // add one to each element
+        add             v30.8h, v30.8h, v20.8h
+        add             v29.8h, v29.8h, v20.8h
+        // divide by 2, narrow width and store in v30
+        uqshrn          v30.8b, v30.8h, #1
+        uqshrn2         v30.16b, v29.8h, #1
+
+        // abs(pix1[0] - avg2(pix2[0], pix2[1]))
+        uabd            v16.16b, v0.16b, v30.16b
+        uaddlv          h16, v16.16b
+
+        // 2nd iteration
+        uaddl           v28.8h, v4.8b, v5.8b
+        uaddl2          v27.8h, v4.16b, v5.16b
+        add             v28.8h, v28.8h, v20.8h
+        add             v27.8h, v27.8h, v20.8h
+
+        uqshrn          v28.8b, v28.8h, #1
+        uqshrn2         v28.16b, v27.8h, #1
+
+        uabd            v17.16b, v3.16b, v28.16b
+        uaddlv          h17, v17.16b
+
+        // 3rd iteration
+        uaddl           v26.8h, v7.8b, v8.8b
+        uaddl2          v25.8h, v7.16b, v8.16b
+        add             v26.8h, v26.8h, v20.8h
+        add             v25.8h, v25.8h, v20.8h
+
+        uqshrn          v26.8b, v26.8h, #1
+        uqshrn2         v26.16b, v25.8h, #1
+
+        uabd            v19.16b, v6.16b, v26.16b
+        uaddlv          h19, v19.16b
+
+        // 4th iteration
+        uaddl           v24.8h, v10.8b, v11.8b
+        uaddl2          v23.8h, v10.16b, v11.16b
+        add             v24.8h, v24.8h, v20.8h
+        add             v23.8h, v23.8h, v20.8h
+
+        uqshrn          v24.8b, v24.8h, #1
+        uqshrn2         v24.16b, v23.8h, #1
+
+        uabd            v21.16b, v9.16b, v24.16b
+        uaddlv          h21, v21.16b
+
+        sub             w4, w4, #4
+
+        // accumulate the result in d18
+        add             d18, d18, d16
+        add             d18, d18, d17
+        add             d18, d18, d19
+        add             d18, d18, d21
+
+        cmp             w4, #4
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+
+        uaddl           v30.8h, v1.8b, v2.8b
+        uaddl2          v29.8h, v1.16b, v2.16b
+        add             v30.8h, v30.8h, v20.8h
+        add             v29.8h, v29.8h, v20.8h
+
+        uqshrn          v30.8b, v30.8h, #1
+        // narrow the high-half sums from v29 (not the constant v20, which
+        // would zero the upper 8 averaged bytes and corrupt the SAD)
+        uqshrn2         v30.16b, v29.8h, #1
+
+        uabd            v28.16b, v0.16b, v30.16b
+        uaddlv          h28, v28.16b
+
+        add             d18, d18, d28
+        subs            w4, w4, #1
+        b.ne            2b
+
+3:
+        fmov            w0, s18                     // copy result to general purpose register
+        ldp             d8, d9, [sp], #0x10
+        ldp             d10, d11, [sp], #0x10
+
+        ret
+endfunc
-- 
2.34.1



More information about the ffmpeg-devel mailing list