[FFmpeg-cvslog] arm64: convert dcadsp neon asm from arm

Janne Grunau git at videolan.org
Sat Jan 2 11:10:59 CET 2016


ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Sat Nov 28 15:23:52 2015 +0100| [c33c1fa8af2b2e82418a06901b6ad17b3d61b73e] | committer: Janne Grunau

arm64: convert dcadsp neon asm from arm

~2% faster dts decoding overall.

                    cortex-a57   cortex-a53
dca_decode_hf_c:    474.8        1659.9
dca_decode_hf_neon: 225.2         301.1
dca_lfe_fir0_c:     913.2        1537.7
dca_lfe_fir0_neon:  286.8         451.9
dca_lfe_fir1_c:     848.7        1711.5
dca_lfe_fir1_neon:  387.1         506.4

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c33c1fa8af2b2e82418a06901b6ad17b3d61b73e
---

 libavcodec/aarch64/Makefile      |    2 +
 libavcodec/aarch64/dcadsp_init.c |   51 ++++++++++++
 libavcodec/aarch64/dcadsp_neon.S |  169 ++++++++++++++++++++++++++++++++++++++
 libavcodec/dcadsp.c              |    2 +
 libavcodec/dcadsp.h              |    1 +
 5 files changed, 225 insertions(+)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index d001b34..0b614a3 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,3 +1,4 @@
+OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
 OBJS-$(CONFIG_FFT)                      += aarch64/fft_init_aarch64.o
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
@@ -15,6 +16,7 @@ OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
 
 ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
 
+NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o
 NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c
new file mode 100644
index 0000000..ad91070
--- /dev/null
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/dcadsp.h"
+
+void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); /* implemented in dcadsp_neon.S */
+void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); /* implemented in dcadsp_neon.S */
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+                                float *synth_buf_ptr, int *synth_buf_offset,
+                                float synth_buf2[32], const float window[512],
+                                float out[32], const float in[32],
+                                float scale); /* NOTE(review): declared only; not referenced elsewhere in this file */
+
+void ff_decode_hf_neon(float dst[DCA_SUBBANDS][8], /* implemented in dcadsp_neon.S */
+                       const int32_t vq_num[DCA_SUBBANDS],
+                       const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                       int32_t scale[DCA_SUBBANDS][2],
+                       intptr_t start, intptr_t end);
+
+av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
+{ /* Install the AArch64 NEON routines into *s when the CPU reports NEON. */
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) { /* without NEON, *s is left exactly as the caller set it up */
+        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
+        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+        s->decode_hf  = ff_decode_hf_neon;
+    }
+}
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
new file mode 100644
index 0000000..73196d9
--- /dev/null
+++ b/libavcodec/aarch64/dcadsp_neon.S
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2015 Janne Grunau <janne-libav at jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_decode_hf_neon, export=1                    // args per the C prototype: x0 dst, x1 vq_num, x2 hf_vq, x3 vq_offset, x4 scale, x5 start, x6 end
+        add             x2,  x2,  x3                    // hf_vq base += vq_offset (bytes)
+        add             x0,  x0,  x5, lsl #5            // dst    += start * 32 bytes (one row of 8 floats)
+        add             x1,  x1,  x5, lsl #2            // vq_num += start * 4 bytes
+        add             x4,  x4,  x5, lsl #3            // scale  += start * 8 bytes (one row of 2 int32)
+        sub             x6,  x6,  x5                    // x6 = end - start = number of rows to produce
+        ldr             w7,  [x1], #4                   // first vq_num entry; note it is read before the count check below
+        add             x7,  x2,  x7, lsl #5            // x7 = address of the 32-byte hf_vq row selected by that entry
+        subs            x6,  x6,  #1
+        b.eq            1f                              // exactly one row: single-row path
+        b.gt            2f                              // two or more rows: paired loop
+        ret                                             // end <= start: nothing to do
+2:
+        ldr             w8,  [x1], #4                   // vq_num entry for the second row of the pair
+        subs            x6,  x6,  #2                    // flags set here are consumed by b.lt and b.gt below
+        add             x8,  x2,  x8, lsl #5            // x8 = address of the second hf_vq row
+        ld1             {v2.4s},  [x4], #16             // four int32: scale words of two consecutive rows
+        ld1             {v0.8b},  [x7]                  // first 8 signed bytes of row one
+        ld1             {v4.8b},  [x8]                  // first 8 signed bytes of row two
+        sxtl            v3.8h,  v0.8b                   // widen s8 -> s16
+        sxtl            v7.8h,  v4.8b
+        scvtf           v2.4s,  v2.4s,  #4              // fixed-point (4 fractional bits) -> float, i.e. scale / 16
+        sxtl            v0.4s,  v3.4h                   // widen s16 -> s32, low half
+        sxtl2           v1.4s,  v3.8h                   // widen s16 -> s32, high half
+        sxtl            v4.4s,  v7.4h
+        sxtl2           v5.4s,  v7.8h
+        scvtf           v0.4s,  v0.4s                   // s32 -> float
+        scvtf           v1.4s,  v1.4s
+        scvtf           v4.4s,  v4.4s
+        scvtf           v5.4s,  v5.4s
+        fmul            v0.4s,  v0.4s,  v2.s[0]         // row one  * first scale word of its row
+        fmul            v1.4s,  v1.4s,  v2.s[0]
+        fmul            v4.4s,  v4.4s,  v2.s[2]         // row two  * first scale word of its row
+        fmul            v5.4s,  v5.4s,  v2.s[2]
+        b.lt            10f                             // this pair was the last: store both and return

+        ldr             w7,  [x1], #4                   // at least one row remains: fetch its vq_num entry
+        add             x7,  x2,  x7, lsl #5
+        st1             {v0.4s,v1.4s},  [x0], #32       // store row one (8 floats)
+        st1             {v4.4s,v5.4s},  [x0], #32       // store row two (8 floats)
+        b.gt            2b                              // two or more rows remain: loop; exactly one: fall through
+1:
+        ldr             w9,  [x4]                       // first scale word of the final row
+        ld1             {v0.8b},  [x7]
+        scvtf           s2,  w9,  #4                    // scale / 16 into s2 (lane 0 of v2)
+        sxtl            v3.8h,  v0.8b
+        sxtl            v0.4s,  v3.4h
+        sxtl2           v1.4s,  v3.8h
+        scvtf           v0.4s,  v0.4s
+        scvtf           v1.4s,  v1.4s
+        fmul            v0.4s,  v0.4s,  v2.s[0]
+        fmul            v1.4s,  v1.4s,  v2.s[0]
+        st1             {v0.4s,v1.4s},  [x0]            // store the final row
+        ret
+10:
+        st1             {v0.4s,v1.4s},  [x0], #32       // store the final pair of rows
+        st1             {v4.4s,v5.4s},  [x0]
+        ret
+endfunc
+
+function ff_dca_lfe_fir0_neon, export=1                 // args per the C prototype: x0 out, x1 in, x2 coefs
+        mov             x3,  #32                // decifactor; loop counter, decremented by 4 per iteration
+        sub             x1,  x1,  #7*4          // x1 = &in[-7]
+        add             x4,  x0,  #2*32*4 - 16  // out2 = &out[60]; written downwards, 4 floats at a time
+        mov             x7,  #-16               // post-index step for the out2 stores
+
+        ld1             {v0.4s,v1.4s}, [x1]     // v0 = in[-7..-4], v1 = in[-3..0]
+        // reverse [-num_coeffs + 1, 0]
+        ext             v3.16b, v0.16b, v0.16b, #8      // swap the 64-bit halves ...
+        ext             v2.16b, v1.16b, v1.16b, #8
+        rev64           v3.4s,  v3.4s                   // ... then the words in each half: v3 = in[-4..-7]
+        rev64           v2.4s,  v2.4s                   // v2 = in[0..-3]
+1:
+        ld1             {v4.4s,v5.4s}, [x2], #32        // coefficient group 0 (8 floats)
+        ld1             {v6.4s,v7.4s}, [x2], #32        // coefficient group 1
+        subs            x3,  x3,  #4                    // flags consumed by b.gt at the end of the loop
+        fmul            v16.4s, v2.4s,  v4.4s           // v16..v19: reversed input * groups 0..3
+        fmul            v23.4s, v0.4s,  v4.4s           // v23..v20: forward input  * groups 0..3
+        fmul            v17.4s, v2.4s,  v6.4s
+        fmul            v22.4s, v0.4s,  v6.4s

+        fmla            v16.4s, v3.4s,  v5.4s
+        fmla            v23.4s, v1.4s,  v5.4s
+        ld1             {v4.4s,v5.4s}, [x2], #32        // coefficient group 2
+        fmla            v17.4s, v3.4s,  v7.4s
+        fmla            v22.4s, v1.4s,  v7.4s
+        ld1             {v6.4s,v7.4s}, [x2], #32        // coefficient group 3
+        fmul            v18.4s, v2.4s,  v4.4s
+        fmul            v21.4s, v0.4s,  v4.4s
+        fmul            v19.4s, v2.4s,  v6.4s
+        fmul            v20.4s, v0.4s,  v6.4s

+        fmla            v18.4s, v3.4s,  v5.4s
+        fmla            v21.4s, v1.4s,  v5.4s
+        fmla            v19.4s, v3.4s,  v7.4s
+        fmla            v20.4s, v1.4s,  v7.4s

+        faddp           v16.4s, v16.4s, v17.4s          // two pairwise-add rounds reduce each
+        faddp           v18.4s, v18.4s, v19.4s          // 4-lane accumulator to a single sum
+        faddp           v20.4s, v20.4s, v21.4s
+        faddp           v22.4s, v22.4s, v23.4s
+        faddp           v16.4s, v16.4s, v18.4s          // v16 = sums for groups 0,1,2,3 (reversed input)
+        faddp           v20.4s, v20.4s, v22.4s          // v20 = sums for groups 3,2,1,0 (forward input)

+        st1             {v16.4s}, [x0], #16             // next 4 outputs, ascending from out
+        st1             {v20.4s}, [x4], x7              // next 4 outputs, descending from out2
+        b.gt            1b

+        ret
+endfunc
+
+function ff_dca_lfe_fir1_neon, export=1                 // args per the C prototype: x0 out, x1 in, x2 coefs
+        mov             x3,  #64                // decifactor; loop counter, decremented by 4 per iteration
+        sub             x1,  x1,  #3*4          // x1 = &in[-3]
+        add             x4,  x0,  #2*64*4 - 16  // out2 = &out[124]; written downwards, 4 floats at a time
+        mov             x7,  #-16               // post-index step for the out2 stores
+
+        ld1             {v0.4s}, [x1]           // v0 = in[-3..0]
+        // reverse [-num_coeffs + 1, 0]
+        ext             v1.16b, v0.16b, v0.16b, #8      // swap the 64-bit halves ...
+        rev64           v1.4s,  v1.4s                   // ... then the words in each half: v1 = in[0..-3]

+1:
+        ld1             {v4.4s,v5.4s}, [x2], #32        // coefficient groups 0 and 1 (4 floats each)
+        ld1             {v6.4s,v7.4s}, [x2], #32        // coefficient groups 2 and 3
+        subs            x3,  x3,  #4                    // flags consumed by b.gt at the end of the loop
+        fmul            v16.4s, v1.4s,  v4.4s           // v16..v19: reversed input * groups 0..3
+        fmul            v23.4s, v0.4s,  v4.4s           // v23..v20: forward input  * groups 0..3
+        fmul            v17.4s, v1.4s,  v5.4s
+        fmul            v22.4s, v0.4s,  v5.4s
+        fmul            v18.4s, v1.4s,  v6.4s
+        fmul            v21.4s, v0.4s,  v6.4s
+        fmul            v19.4s, v1.4s,  v7.4s
+        fmul            v20.4s, v0.4s,  v7.4s
+        faddp           v16.4s, v16.4s, v17.4s          // two pairwise-add rounds reduce each
+        faddp           v18.4s, v18.4s, v19.4s          // 4-lane product vector to a single sum
+        faddp           v20.4s, v20.4s, v21.4s
+        faddp           v22.4s, v22.4s, v23.4s
+        faddp           v16.4s, v16.4s, v18.4s          // v16 = sums for groups 0,1,2,3 (reversed input)
+        faddp           v20.4s, v20.4s, v22.4s          // v20 = sums for groups 3,2,1,0 (forward input)
+        st1             {v16.4s}, [x0], #16             // next 4 outputs, ascending from out
+        st1             {v20.4s}, [x4], x7              // next 4 outputs, descending from out2
+        b.gt            1b

+        ret
+endfunc
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 34b5da2..9105a4c 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -110,6 +110,8 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
     s->qmf_32_subbands = dca_qmf_32_subbands;
     s->decode_hf       = decode_hf_c;
 
+    if (ARCH_AARCH64)
+        ff_dcadsp_init_aarch64(s);
     if (ARCH_ARM)
         ff_dcadsp_init_arm(s);
     if (ARCH_X86)
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index 0fa75a5..0669128 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -40,6 +40,7 @@ typedef struct DCADSPContext {
 } DCADSPContext;
 
 void ff_dcadsp_init(DCADSPContext *s);
+void ff_dcadsp_init_aarch64(DCADSPContext *s);
 void ff_dcadsp_init_arm(DCADSPContext *s);
 void ff_dcadsp_init_x86(DCADSPContext *s);
 



More information about the ffmpeg-cvslog mailing list