[FFmpeg-devel] [PATCH v3 1/2] avutil/loongarch: add LSX optimization for aac audio decode
pengxu
pengxu at loongson.cn
Thu Apr 18 12:42:22 EEST 2024
Add functions:
vector_fmul_window_lsx
butterflies_float_lsx
vector_fmul_scalar_lsx
./ffmpeg -i ../../1.aac -f null -
before:482x
after:523x
---
libavutil/float_dsp.c | 2 +
libavutil/float_dsp.h | 1 +
libavutil/loongarch/Makefile | 5 +-
libavutil/loongarch/float_dsp.S | 287 ++++++++++++++++++
libavutil/loongarch/float_dsp.h | 32 ++
.../loongarch/float_dsp_init_loongarch.c | 35 +++
6 files changed, 361 insertions(+), 1 deletion(-)
create mode 100644 libavutil/loongarch/float_dsp.S
create mode 100644 libavutil/loongarch/float_dsp.h
create mode 100644 libavutil/loongarch/float_dsp_init_loongarch.c
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
index e9fb023466..7128ff3f96 100644
--- a/libavutil/float_dsp.c
+++ b/libavutil/float_dsp.c
@@ -162,6 +162,8 @@ av_cold AVFloatDSPContext *avpriv_float_dsp_alloc(int bit_exact)
ff_float_dsp_init_x86(fdsp);
#elif ARCH_MIPS
ff_float_dsp_init_mips(fdsp);
+#elif ARCH_LOONGARCH64
+ ff_float_dsp_init_loongarch(fdsp);
#endif
return fdsp;
}
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index 342a8715c5..679a930eab 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -206,6 +206,7 @@ void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int strict);
void ff_float_dsp_init_riscv(AVFloatDSPContext *fdsp);
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp);
void ff_float_dsp_init_mips(AVFloatDSPContext *fdsp);
+void ff_float_dsp_init_loongarch(AVFloatDSPContext *fdsp);
/**
* Allocate a float DSP context.
diff --git a/libavutil/loongarch/Makefile b/libavutil/loongarch/Makefile
index 2addd9351c..ae710f0515 100644
--- a/libavutil/loongarch/Makefile
+++ b/libavutil/loongarch/Makefile
@@ -1 +1,4 @@
-OBJS += loongarch/cpu.o
+OBJS += loongarch/cpu.o \
+ loongarch/float_dsp_init_loongarch.o
+
+LSX-OBJS += loongarch/float_dsp.o
diff --git a/libavutil/loongarch/float_dsp.S b/libavutil/loongarch/float_dsp.S
new file mode 100644
index 0000000000..5073c8424f
--- /dev/null
+++ b/libavutil/loongarch/float_dsp.S
@@ -0,0 +1,287 @@
+/*
+ * LoongArch LSX optimized float DSP functions
+ *
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+
+/* void vector_fmul_window(float *dst, const float *src0,
+ const float *src1, const float *win, int len) */
+function vector_fmul_window_lsx
+ addi.d sp, sp, -16 // adjust by 16, not 8, so sp keeps 16-byte alignment
+ st.d $r23, sp, 0 // NOTE(review): $r23 is never used below; confirm this save is needed
+
+ move t4, a0 // t4 = dst
+ move t5, a1 // t5 = src0
+ move t6, a2 // t6 = src1
+ move t7, a3 // t7 = win
+ move t8, a4
+ slli.d t8, t8, 2 // t8 = len * 4 (bytes)
+
+ add.d t4, t4, t8 // t4 = dst + len
+ add.d t7, t7, t8 // t7 = win + len
+ add.d t5, t5, t8 // t5 = src0 + len
+
+ add.d a6, $r0, t8 // a6: ascending index i lives at (base - a6), in bytes
+ addi.d a7, t8, -4 // a7: descending index j lives at (base + a7), in bytes
+
+ move a5, $r0 // a5 = vector iteration counter
+ srai.d t0, a4, 2 // t0 = len >> 2 vector iterations
+ beq a5, t0, .VFW02
+
+.VFW01: // four i-elements and four j-elements per iteration
+ sub.d t1, t5, a6
+ addi.d t2, a7, -12 // byte offset of element j - 3
+ vld vr1, t1, 0x00 //s0
+ vldx vr2, a2, t2 //s1
+
+ sub.d t1, t7, a6
+ vld vr3, t1, 0x00 //wi
+ vldx vr4, t7, t2 //wj
+
+ vpermi.w vr2, vr2, 0x1b // reverse the four words so lane k pairs with i + k
+ vpermi.w vr4, vr4, 0x1b // same reversal for wj
+
+ vfmul.s vr5, vr2, vr3
+ vfmsub.s vr5, vr1, vr4, vr5 //dsti = s0 * wj - s1 * wi
+
+ vfmul.s vr6, vr2, vr4
+ vfmadd.s vr6, vr1, vr3, vr6 //dstj = s0 * wi + s1 * wj
+
+ vpermi.w vr6, vr6, 0x1b // back to memory order before the store at j - 3
+
+ sub.d t1, t4, a6
+ vst vr5, t1, 0x00
+ vstx vr6, t4, t2
+
+ addi.d a6, a6, -16 // four floats consumed on each side
+ addi.d a7, a7, -16
+
+ addi.d a5, a5, 1
+ blt a5, t0, .VFW01
+
+.VFW02: // two leftover element pairs when (len & 2)
+ andi t0, a4, 2
+ beq $r0, t0, .VFW03
+
+ sub.d t0, t5, a6 // t0 = &src0[i]
+ addi.d t1, a7, -4
+ add.d t1, t6, t1 // t1 = &src1[j - 1]; s1 is based on src1 (t6), as in the vector loop
+
+ sub.d t2, t7, a6 // t2 = &win[i]
+ addi.d t3, a7, -4
+ add.d t3, t7, t3 // t3 = &win[j - 1]
+
+ fld.s f0, t0, 0x00 //s0
+ fld.s f1, t0, 0x04
+
+ fld.s f2, t1, 0x04 //s1
+ fld.s f3, t1, 0x00
+
+ fld.s f4, t2, 0x00 //wi
+ fld.s f5, t2, 0x04
+
+ fld.s f6, t3, 0x04 //wj
+ fld.s f7, t3, 0x00
+
+ fmul.s f8, f2, f4
+ fmsub.s f8, f0, f6, f8 //dsti
+ fmul.s f9, f3, f5
+ fmsub.s f9, f1, f7, f9
+
+ fmul.s f10, f2, f6
+ fmadd.s f10, f0, f4, f10 //dstj
+ fmul.s f11, f3, f7
+ fmadd.s f11, f1, f5, f11
+
+ sub.d t2, t4, a6
+ add.d t3, t4, a7
+ addi.d t3, t3, -4
+
+ fst.s f8, t2, 0x00
+ fst.s f9, t2, 0x04
+ fst.s f10, t3, 0x04
+ fst.s f11, t3, 0x00
+
+ addi.d a6, a6, -8 // a6/a7 are byte offsets: two floats consumed
+ addi.d a7, a7, -8
+
+.VFW03: // one leftover element pair when (len & 1)
+ andi t0, a4, 1
+ beq $r0, t0, .VFW04
+
+ sub.d t0, $r0, a6 // t0 = -a6: index only, fldx.s adds the base register itself
+
+ fldx.s f0, t5, t0 //s0
+ fldx.s f2, t6, a7 //s1
+ fldx.s f4, t7, t0 //wi
+ fldx.s f6, t7, a7 //wj
+
+ fmul.s f8, f2, f4
+ fmsub.s f8, f0, f6, f8 //dsti
+
+ fmul.s f10, f2, f6
+ fmadd.s f10, f0, f4, f10 //dstj
+
+ sub.d t0, t4, a6
+
+ fst.s f8, t0, 0x00
+ fstx.s f10, t4, a7
+
+ addi.d a6, a6, -4 // one float consumed (byte offsets)
+ addi.d a7, a7, -4
+
+.VFW04:
+ ld.d $r23, sp, 0
+ addi.d sp, sp, 16 // undo the 16-byte adjustment made on entry
+
+endfunc
+
+
+/* void butterflies_float(float *restrict v1, float *restrict v2,
+ int len): in place, v1[i] = v1[i] + v2[i] and v2[i] = old v1[i] - v2[i] */
+function butterflies_float_lsx
+ move a6, $r0 // a6 = vector iteration counter
+ move a7, $r0 // a7 = byte offset shared by v1 and v2
+
+ move t4, a0 // t4 = v1
+ move t5, a1 // t5 = v2
+ move t6, a2 // t6 = len
+
+ srai.d t0, t6, 2 // t0 = len >> 2 four-float iterations
+ beq a6, t0, .BFL02 // skip the vector loop when there are none
+
+.BFL01: // four floats per iteration
+ vldx vr0, t4, a7
+ vldx vr1, t5, a7
+
+ vfsub.s vr3, vr0, vr1 // difference, computed from the values loaded above
+ vfadd.s vr4, vr0, vr1 // sum
+
+ vstx vr4, t4, a7 // sum goes back to v1
+ vstx vr3, t5, a7 // difference goes back to v2
+
+ addi.d a7, a7, 16
+ addi.d a6, a6, 1
+ blt a6, t0, .BFL01
+
+.BFL02: // two leftover floats when (len & 2)
+ andi t0, t6, 2
+ beq $r0, t0, .BFL03
+
+ add.d t1, t4, a7 // t1 = v1 + offset
+ add.d t2, t5, a7 // t2 = v2 + offset
+
+ fld.s f0, t1, 0x00
+ fld.s f1, t1, 0x04
+ fld.s f2, t2, 0x00
+ fld.s f3, t2, 0x04
+
+ fsub.s f4, f0, f2
+ fsub.s f5, f1, f3
+ fadd.s f6, f0, f2
+ fadd.s f7, f1, f3
+
+ fst.s f6, t1, 0x00 // sums to v1
+ fst.s f7, t1, 0x04
+ fst.s f4, t2, 0x00 // differences to v2
+ fst.s f5, t2, 0x04
+
+ addi.d a7, a7, 8
+
+.BFL03: // one leftover float when (len & 1)
+ andi t0, t6, 1
+ beq $r0, t0, .BFL04
+
+ fldx.s f0, t4, a7
+ fldx.s f2, t5, a7
+
+ fsub.s f4, f0, f2
+ fadd.s f6, f0, f2
+
+ fstx.s f6, t4, a7
+ fstx.s f4, t5, a7
+
+ addi.d a7, a7, 4 // a7 is not read again in this function
+
+.BFL04:
+endfunc // macro from loongson_asm.S; presumably emits the return - TODO confirm
+
+
+/* void vector_fmul_scalar_lsx(float *dst, const float *src, float mul,
+ int len): dst[i] = src[i] * mul for each of the len floats */
+function vector_fmul_scalar_lsx
+ move a6, $r0 // a6 = vector iteration counter
+ move a7, $r0 // a7 = byte offset shared by src and dst
+
+ move t4, a0 // t4 = dst
+ move t5, a1 // t5 = src
+ move t6, a2 // t6 = len; assumes the float argument does not use an integer register - TODO confirm against the ABI
+
+ vpermi.w vr0, vr0, 0x00 // copy word 0 of vr0 into all four words; assumes mul arrives in f0 aliasing vr0 - TODO confirm
+
+ srai.d t0, t6, 2 // t0 = len >> 2 four-float iterations
+ beq a6, t0, .BFS02 // skip the vector loop when there are none
+
+.BFS01: // four floats per iteration
+ vldx vr1, t5, a7
+
+ vfmul.s vr2, vr1, vr0
+
+ vstx vr2, t4, a7
+
+ addi.d a7, a7, 16
+ addi.d a6, a6, 1
+ blt a6, t0, .BFS01
+
+.BFS02: // two leftover floats when (len & 2)
+ andi t0, t6, 2
+ beq $r0, t0, .BFS03
+
+ add.d t1, t5, a7 // t1 = src + offset
+ add.d t2, t4, a7 // t2 = dst + offset
+
+ fld.s f1, t1, 0x00
+ fld.s f2, t1, 0x04
+
+ fmul.s f3, f1, f0
+ fmul.s f4, f2, f0
+
+ fst.s f3, t2, 0x00
+ fst.s f4, t2, 0x04
+
+ addi.d a7, a7, 8
+
+.BFS03: // one leftover float when (len & 1)
+ andi t0, t6, 1
+ beq $r0, t0, .BFS04
+
+ fldx.s f1, t5, a7
+
+ fmul.s f3, f1, f0
+
+ fstx.s f3, t4, a7
+
+ addi.d a7, a7, 4 // a7 is not read again in this function
+
+.BFS04:
+endfunc // macro from loongson_asm.S; presumably emits the return - TODO confirm
\ No newline at end of file
diff --git a/libavutil/loongarch/float_dsp.h b/libavutil/loongarch/float_dsp.h
new file mode 100644
index 0000000000..644c1f3713
--- /dev/null
+++ b/libavutil/loongarch/float_dsp.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LOONGARCH_FLOAT_DSP_H
+#define AVUTIL_LOONGARCH_FLOAT_DSP_H
+
+#include "libavutil/float_dsp.h"
+
+void vector_fmul_window_lsx(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+
+void butterflies_float_lsx(float *restrict v1, float *restrict v2, int len);
+
+void vector_fmul_scalar_lsx(float *dst, const float *src, float mul, int len);
+
+#endif /* AVUTIL_LOONGARCH_FLOAT_DSP_H */
\ No newline at end of file
diff --git a/libavutil/loongarch/float_dsp_init_loongarch.c b/libavutil/loongarch/float_dsp_init_loongarch.c
new file mode 100644
index 0000000000..592ba78058
--- /dev/null
+++ b/libavutil/loongarch/float_dsp_init_loongarch.c
@@ -0,0 +1,35 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "float_dsp.h"
+#include "libavutil/loongarch/cpu.h"
+
+av_cold void ff_float_dsp_init_loongarch(AVFloatDSPContext *fdsp)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!have_lsx(flags)) /* LSX not usable: leave fdsp untouched */
+        return;
+    fdsp->butterflies_float  = butterflies_float_lsx;
+    fdsp->vector_fmul_scalar = vector_fmul_scalar_lsx;
+    fdsp->vector_fmul_window = vector_fmul_window_lsx;
+}
--
2.20.1
More information about the ffmpeg-devel
mailing list