[FFmpeg-devel] [PATCH v3 4/5] avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON

Geoff Hill geoff at geoffhill.org
Wed Apr 3 09:43:34 EEST 2024


Signed-off-by: Geoff Hill <geoff at geoffhill.org>
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 +++++
 libavcodec/aarch64/ac3dsp_neon.S         | 24 +++++++++++++++++++++
 tests/checkasm/ac3dsp.c                  | 27 ++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index 1bdc215b51..e95436c651 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -28,6 +28,10 @@
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+                                            const int32_t *coef0,
+                                            const int32_t *coef1,
+                                            int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
     c->ac3_exponent_min = ff_ac3_exponent_min_neon;
     c->extract_exponents = ff_ac3_extract_exponents_neon;
     c->float_to_fixed24 = ff_float_to_fixed24_neon;
+    c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index b26f71a3f6..fa8fcf2e47 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -64,3 +64,27 @@ function ff_float_to_fixed24_neon, export=1
         b.ne        0b
         ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+        movi        v0.2d, #0                   // v0: sum of coef0^2
+        movi        v1.2d, #0                   // v1: sum of coef1^2
+        movi        v2.2d, #0                   // v2: sum of (coef0 + coef1)^2
+        movi        v3.2d, #0                   // v3: sum of (coef0 - coef1)^2
+        cbz         w3, 1f                      // len (w3) == 0: skip the loop, still store zeros
+0:      ld1         {v4.2s}, [x1], #8           // next two int32 from coef0 (x1)
+        ld1         {v5.2s}, [x2], #8           // next two int32 from coef1 (x2)
+        add         v6.2s, v4.2s, v5.2s
+        sub         v7.2s, v4.2s, v5.2s
+        smlal       v0.2d, v4.2s, v4.2s         // widening multiply-accumulate into 64-bit lanes
+        smlal       v1.2d, v5.2s, v5.2s
+        smlal       v2.2d, v6.2s, v6.2s
+        smlal       v3.2d, v7.2s, v7.2s
+        subs        w3, w3, #2                  // two coefficients consumed per iteration
+        b.gt        0b
+1:      addp        d0, v0.2d                   // fold the two 64-bit lanes into one total
+        addp        d1, v1.2d
+        addp        d2, v2.2d
+        addp        d3, v3.2d
+        st1         {v0.1d-v3.1d}, [x0]         // write sum[0..3] (x0)
+        ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index a8a20349f9..c920dc9eb0 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -136,6 +136,32 @@ static void check_float_to_fixed24(AC3DSPContext *c) {
     report("float_to_fixed24");
 }
 
+/* Compare sum_square_butterfly_int32 against the reference on random input. */
+static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
+#define ELEMS 240
+    LOCAL_ALIGNED_16(int32_t, lt, [ELEMS]);
+    LOCAL_ALIGNED_16(int32_t, rt, [ELEMS]);
+    LOCAL_ALIGNED_16(int64_t, v1, [4]);     /* sums written by call_ref */
+    LOCAL_ALIGNED_16(int64_t, v2, [4]);     /* sums written by call_new */
+    declare_func(void, int64_t[4], const int32_t *, const int32_t *, int);
+
+    randomize_i24(lt, ELEMS);
+    randomize_i24(rt, ELEMS);
+
+    if (check_func(c->sum_square_butterfly_int32,
+                   "ac3_sum_square_butterfly_int32")) {
+        call_ref(v1, lt, rt, ELEMS);
+        call_new(v2, lt, rt, ELEMS);
+
+        if (memcmp(v1, v2, sizeof(int64_t[4])) != 0)
+            fail();
+
+        bench_new(v2, lt, rt, ELEMS);
+    }
+
+    report("ac3_sum_square_butterfly_int32");
+}
+
 void checkasm_check_ac3dsp(void)
 {
     AC3DSPContext c;
@@ -144,4 +170,5 @@ void checkasm_check_ac3dsp(void)
     check_ac3_exponent_min(&c);
     check_ac3_extract_exponents(&c);
     check_float_to_fixed24(&c);
+    check_ac3_sum_square_butterfly_int32(&c);
 }
-- 
2.44.0



More information about the ffmpeg-devel mailing list