[FFmpeg-devel] [PATCH v2 2/2] avcodec/loongarch: add LSX optimization for aac audio encode

Thu Apr 18 10:36:09 EEST 2024

Add functions:
    ff_abs_pow34_lsx
    ff_aac_quantize_bands_lsx

./ffmpeg -f s16le -ac 2 -i ../../1.pcm -c:a aac -f null -
before:37.5x
after:48.1x
---
 libavcodec/aacencdsp.h                        |   3 +
 libavcodec/loongarch/Makefile                 |   2 +
 .../loongarch/aacencdsp_init_loongarch.c      |  33 +++
 libavcodec/loongarch/aacencdsp_loongarch.S    | 254 ++++++++++++++++++
 libavcodec/loongarch/aacencdsp_loongarch.h    |  35 +++
 5 files changed, 327 insertions(+)
 create mode 100644 libavcodec/loongarch/aacencdsp_init_loongarch.c
 create mode 100644 libavcodec/loongarch/aacencdsp_loongarch.S
 create mode 100644 libavcodec/loongarch/aacencdsp_loongarch.h

diff --git a/libavcodec/aacencdsp.h b/libavcodec/aacencdsp.h
index 67836d8cf7..5db27a95a9 100644
--- a/libavcodec/aacencdsp.h
+++ b/libavcodec/aacencdsp.h
@@ -34,6 +34,7 @@ typedef struct AACEncDSPContext {
 
 void ff_aacenc_dsp_init_riscv(AACEncDSPContext *s);
 void ff_aacenc_dsp_init_x86(AACEncDSPContext *s);
+void ff_aacenc_dsp_init_loongarch(AACEncDSPContext *s);
 
 static inline void abs_pow34_v(float *out, const float *in, const int size)
 {
@@ -66,6 +67,8 @@ static inline void ff_aacenc_dsp_init(AACEncDSPContext *s)
     ff_aacenc_dsp_init_riscv(s);
 #elif ARCH_X86
     ff_aacenc_dsp_init_x86(s);
+#elif ARCH_LOONGARCH64
+    ff_aacenc_dsp_init_loongarch(s);
 #endif
 }
 
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07da2964e4..068fd61810 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_HPELDSP)                += loongarch/hpeldsp_init_loongarch.o
 OBJS-$(CONFIG_IDCTDSP)                += loongarch/idctdsp_init_loongarch.o
 OBJS-$(CONFIG_VIDEODSP)               += loongarch/videodsp_init.o
 OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
+OBJS-$(CONFIG_AAC_ENCODER)            += loongarch/aacencdsp_init_loongarch.o
 LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
 LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
                                          loongarch/h264_deblock_lasx.o
@@ -38,3 +39,4 @@ LSX-OBJS-$(CONFIG_H264QPEL)           += loongarch/h264qpel.o \
                                          loongarch/h264qpel_lsx.o
 LSX-OBJS-$(CONFIG_H264CHROMA)         += loongarch/h264chroma.o
 LSX-OBJS-$(CONFIG_H264PRED)           += loongarch/h264intrapred.o
+LSX-OBJS-$(CONFIG_AAC_ENCODER)        += loongarch/aacencdsp_loongarch.o
diff --git a/libavcodec/loongarch/aacencdsp_init_loongarch.c b/libavcodec/loongarch/aacencdsp_init_loongarch.c
new file mode 100644
index 0000000000..5f67a5857d
--- /dev/null
+++ b/libavcodec/loongarch/aacencdsp_init_loongarch.c
@@ -0,0 +1,33 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "aacencdsp_loongarch.h"
+
+av_cold void ff_aacenc_dsp_init_loongarch(AACEncDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) { /* install the LSX routines only when the CPU reports LSX; otherwise *s is not modified */
+        s->abs_pow34   = ff_abs_pow34_lsx;
+        s->quant_bands = ff_aac_quantize_bands_lsx;
+    }
+}
diff --git a/libavcodec/loongarch/aacencdsp_loongarch.S b/libavcodec/loongarch/aacencdsp_loongarch.S
new file mode 100644
index 0000000000..b80bb98aa9
--- /dev/null
+++ b/libavcodec/loongarch/aacencdsp_loongarch.S
@@ -0,0 +1,254 @@
+/*
+ * LoongArch LSX optimized AAC encoder DSP functions
+ *
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+
+/* void ff_abs_pow34_lsx(float *out, const float *in, const int size); */
+// out[i] = sqrt(|in[i]| * sqrt(|in[i]|)) = |in[i]|^(3/4); out:a0, in:a1, size:a2
+function ff_abs_pow34_lsx
+    move          t0,      zero  // iterations done by the 4-wide loop
+    move          t1,      zero  // byte offset into in/out
+
+    srai.d        t2,      a2,      2  // t2 = size / 4
+    beq           zero,    t2,      .FAPL02
+
+.FAPL01:  /* 4 elements per iteration */
+    add.d         t3,      a1,      t1
+    fld.s         f0,      t3,      0x00
+    fld.s         f1,      t3,      0x04
+    fld.s         f2,      t3,      0x08
+    fld.s         f3,      t3,      0x0c
+
+    fabs.s        f0,      f0
+    fabs.s        f1,      f1
+    fabs.s        f2,      f2
+    fabs.s        f3,      f3
+
+    vextrins.w    vr0,     vr1,     0x10  // vr0 lane 1 = |in[i+1]|
+    vextrins.w    vr0,     vr2,     0x20  // vr0 lane 2 = |in[i+2]|
+    vextrins.w    vr0,     vr3,     0x30  // vr0 lane 3 = |in[i+3]|
+
+    vfsqrt.s      vr4,     vr0            // sqrt(a)
+    vfmul.s       vr5,     vr0,     vr4   // a * sqrt(a)
+    vfsqrt.s      vr6,     vr5            // sqrt(a * sqrt(a))
+
+    vstx          vr6,     a0,      t1
+
+    addi.d        t1,      t1,      16
+    addi.d        t0,      t0,      1
+    blt           t0,      t2,      .FAPL01
+
+.FAPL02:  /* 2 remaining elements (size & 2) */
+    andi          t0,      a2,      2
+    beq           zero,    t0,      .FAPL03
+
+    add.d         t3,      a1,      t1
+    add.d         t4,      a0,      t1
+
+    fld.s         f0,      t3,      0x00
+    fld.s         f1,      t3,      0x04
+
+    fabs.s        f0,      f0
+    fabs.s        f1,      f1
+
+    fsqrt.s       f2,      f0
+    fsqrt.s       f3,      f1
+
+    fmul.s        f4,      f0,      f2
+    fmul.s        f5,      f1,      f3
+
+    fsqrt.s       f6,      f4
+    fsqrt.s       f7,      f5
+
+    fst.s         f6,      t4,      0x00  // store the results (was a load from out)
+    fst.s         f7,      t4,      0x04
+
+    addi.d        t1,      t1,      8
+
+.FAPL03:  /* last element (size & 1) */
+    andi          t0,      a2,      1
+    beq           zero,    t0,      .FAPL04
+
+    fldx.s        f0,      a1,      t1
+
+    fabs.s        f0,      f0
+    fsqrt.s       f2,      f0
+    fmul.s        f4,      f0,      f2
+    fsqrt.s       f6,      f4
+
+    fstx.s        f6,      a0,      t1    // store the result (was a load from out)
+
+    addi.d        t1,      t1,      4
+
+.FAPL04:
+endfunc
+
+
+
+/* void ff_aac_quantize_bands_lsx(int *out, const float *in, const float *scaled,
+                               int size, int is_signed, int maxval, const float Q34,
+                               const float rounding) */
+// out[i] = (int)min(scaled[i] * Q34 + rounding, (float)maxval), negated when
+// is_signed != 0 and in[i] < 0.0f.  Argument registers:
+// out:       a0
+// in:        a1
+// scaled:    a2
+// size:      a3
+// is_signed: a4
+// maxval:    a5
+// Q34:       f0, rounding: f1
+function ff_aac_quantize_bands_lsx
+    move          t0,      zero  // iterations done by the 4-wide loop
+    move          t1,      zero  // byte offset into in/scaled/out
+
+    vpermi.w      vr0,     vr0,     0x00  // Q34 in all four lanes
+    vpermi.w      vr1,     vr1,     0x00  // rounding in all four lanes
+    // Loop-invariant constants, set up once: (float)maxval in all lanes of vr5
+    movgr2fr.w    f5,      a5
+    ffint.s.w     f5,      f5
+    vpermi.w      vr5,     vr5,     0x00  // maxval
+    // and 0.0f in f8 for the in[i] < 0 tests (independent of Q34).
+    vxor.v        vr8,     vr8,     vr8
+
+    srai.d        t2,      a3,      2  // t2 = size / 4
+    beq           zero,    t2,      .FAQBL02
+
+.FAQBL01: /* 4 elements per iteration */
+    vldx          vr2,     a2,      t1    // scaled[i..i+3]
+    vfmul.s       vr3,     vr2,     vr0  // qc
+    vfadd.s       vr4,     vr3,     vr1   // qc + rounding
+
+    vfmin.s       vr6,     vr4,     vr5   // clamp to maxval
+    vfrintrz.s    vr7,     vr6   // tmp: rounded toward zero, still in float format
+
+    beq           a4,      zero,    .S4ISEND  // unsigned: skip the sign handling
+
+    // Negate each lane of tmp whose input sample is negative, one lane at a time.
+    vldx          vr9,     a1,      t1    // in[i..i+3]; f9 = in[i]
+    vextrins.w    vr10,    vr9,     0x01  // f10 = in[i+1]
+    vextrins.w    vr11,    vr9,     0x02  // f11 = in[i+2]
+    vextrins.w    vr12,    vr9,     0x03  // f12 = in[i+3]
+.S4IS00:
+    fcmp.clt.s    $fcc0,   f9,      f8
+    bceqz         $fcc0,   .S4IS01
+    vextrins.w    vr13,    vr7,     0x00
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x00
+.S4IS01:
+    fcmp.clt.s    $fcc1,   f10,     f8
+    bceqz         $fcc1,   .S4IS02
+    vextrins.w    vr13,    vr7,     0x01
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x10
+.S4IS02:
+    fcmp.clt.s    $fcc2,   f11,     f8
+    bceqz         $fcc2,   .S4IS03
+    vextrins.w    vr13,    vr7,     0x02
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x20
+.S4IS03:
+    fcmp.clt.s    $fcc3,   f12,     f8
+    bceqz         $fcc3,   .S4ISEND
+    vextrins.w    vr13,    vr7,     0x03
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x30
+.S4ISEND:
+    vftintrz.w.s  vr14,    vr7            // float -> int32
+    vstx          vr14,    a0,      t1
+    addi.d        t1,      t1,      16
+    addi.d        t0,      t0,      1
+    blt           t0,      t2,      .FAQBL01
+
+.FAQBL02: /* 2 remaining elements (size & 2) */
+    andi          t2,      a3,      2
+    beq           $r0,     t2,      .FAQBL03
+
+    // NOTE(review): vldx is a full 128-bit load, so this reads 8 bytes past
+    // the two remaining floats; presumably the callers' buffers are padded
+    // - confirm. Lanes 2 and 3 are computed but never stored.
+    vldx          vr2,     a2,      t1
+    vfmul.s       vr3,     vr2,     vr0  // qc
+    vfadd.s       vr4,     vr3,     vr1   // qc + rounding
+
+    vfmin.s       vr6,     vr4,     vr5   // clamp to maxval
+    vfrintrz.s    vr7,     vr6   // tmp: rounded toward zero, still in float format
+
+    beq           a4,      zero,    .S2ISEND
+
+    // Negate tmp lanes 0 and 1 where the input sample is negative.
+    vldx          vr9,     a1,      t1    // in; f9 = in[i]
+    vextrins.w    vr10,    vr9,     0x01  // f10 = in[i+1]
+.S2IS00:
+    fcmp.clt.s    $fcc0,   f9,      f8
+    bceqz         $fcc0,   .S2IS01
+    vextrins.w    vr13,    vr7,     0x00
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x00
+.S2IS01:
+    fcmp.clt.s    $fcc1,   f10,     f8
+    bceqz         $fcc1,   .S2ISEND
+    vextrins.w    vr13,    vr7,     0x01
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x10
+.S2ISEND:
+    vftintrz.w.s  vr14,    vr7
+    vpickve2gr.w  t3,      vr14,    0
+    vpickve2gr.w  t4,      vr14,    1
+    add.d         t7,      a0,      t1
+    st.w          t3,      t7,      0x00
+    st.w          t4,      t7,      0x04
+    addi.d        t1,      t1,      8
+
+.FAQBL03: /* last element (size & 1) */
+    andi          t2,      a3,      1
+    beq           $r0,     t2,      .FAQBL04
+
+    // NOTE(review): as in the 2-element tail, vldx loads 16 bytes although
+    // only one float remains (12 bytes past it); presumably the buffers are
+    // padded - confirm. Only lane 0 is stored.
+    vldx          vr2,     a2,      t1
+    vfmul.s       vr3,     vr2,     vr0  // qc
+    vfadd.s       vr4,     vr3,     vr1   // qc + rounding
+
+    vfmin.s       vr6,     vr4,     vr5   // clamp to maxval
+    vfrintrz.s    vr7,     vr6   // tmp: rounded toward zero, still in float format
+
+    beq           a4,      zero,    .S1ISEND
+
+    // Negate tmp lane 0 when the input sample is negative.
+    vldx          vr9,     a1,      t1    // in; f9 = in[i]
+.S1IS00:
+    fcmp.clt.s    $fcc0,   f9,      f8
+    bceqz         $fcc0,   .S1ISEND
+    vextrins.w    vr13,    vr7,     0x00
+    fneg.s        f13,     f13
+    vextrins.w    vr7,     vr13,    0x00
+.S1ISEND:
+    vftintrz.w.s  vr14,    vr7
+    vpickve2gr.w  t3,      vr14,    0
+    stx.w         t3,      a0,      t1
+    addi.d        t1,      t1,      4
+
+.FAQBL04:
+endfunc
diff --git a/libavcodec/loongarch/aacencdsp_loongarch.h b/libavcodec/loongarch/aacencdsp_loongarch.h
new file mode 100644
index 0000000000..076cd4d247
--- /dev/null
+++ b/libavcodec/loongarch/aacencdsp_loongarch.h
@@ -0,0 +1,35 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_AACENC_H
+#define AVCODEC_LOONGARCH_AACENC_H
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_lsx(float *out, const float *in, const int size); /* installed as AACEncDSPContext.abs_pow34 when LSX is available */
+void ff_aac_quantize_bands_lsx(int *out, const float *in, const float *scaled,
+                               int size, int is_signed, int maxval, const float Q34,
+                               const float rounding); /* installed as AACEncDSPContext.quant_bands when LSX is available */
+
+#endif /* AVCODEC_LOONGARCH_AACENC_H */
-- 
2.20.1