[FFmpeg-devel] [PATCH 1/2] lavc/aarch64: add ff_simple_idct{, _add, _put}_neon functions
Matthieu Bouron
matthieu.bouron at gmail.com
Thu Feb 23 17:59:17 EET 2017
---
libavcodec/aarch64/Makefile | 2 +
libavcodec/aarch64/idct.h | 28 +++
libavcodec/aarch64/idctdsp_init_aarch64.c | 40 ++++
libavcodec/aarch64/simple_idct_neon.S | 362 ++++++++++++++++++++++++++++++
libavcodec/idctdsp.c | 2 +
libavcodec/idctdsp.h | 2 +
6 files changed, 436 insertions(+)
create mode 100644 libavcodec/aarch64/idct.h
create mode 100644 libavcodec/aarch64/idctdsp_init_aarch64.c
create mode 100644 libavcodec/aarch64/simple_idct_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 37666b42cb..104bc67802 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -36,6 +36,8 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \
+ aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
diff --git a/libavcodec/aarch64/idct.h b/libavcodec/aarch64/idct.h
new file mode 100644
index 0000000000..05699c2286
--- /dev/null
+++ b/libavcodec/aarch64/idct.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_IDCT_H
+#define AVCODEC_AARCH64_IDCT_H
+
+#include <stdint.h>
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
+
+#endif /* AVCODEC_AARCH64_IDCT_H */
diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
new file mode 100644
index 0000000000..e92223e388
--- /dev/null
+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -0,0 +1,40 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idct.h"
+
+av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+{
+ if (!avctx->lowres && !high_bit_depth) {
+ if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+ c->idct_put = ff_simple_idct_put_neon;
+ c->idct_add = ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->perm_type = FF_IDCT_PERM_PARTTRANS;
+ }
+ }
+}
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
new file mode 100644
index 0000000000..52273420f9
--- /dev/null
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron at gmail.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4c ((1<<(COL_SHIFT-1))/Z4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define z1 v0.H[0]
+#define z2 v0.H[1]
+#define z3 v0.H[2]
+#define z4 v0.H[3]
+#define z5 v0.H[4]
+#define z6 v0.H[5]
+#define z7 v0.H[6]
+#define z4c v0.H[7]
+
+const idct_coeff_neon, align=4
+ .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+endconst
+
+.macro idct_start data
+ prfm pldl1keep, [\data]
+ mov x10, x30
+ movrel x3, idct_coeff_neon
+ ld1 {v0.2D}, [x3]
+.endm
+
+.macro idct_end
+ br x10
+.endm
+
+.macro smull1 a b c
+ smull \a, \b, \c
+.endm
+
+.macro smlal1 a b c
+ smlal \a, \b, \c
+.endm
+
+.macro smlsl1 a b c
+ smlsl \a, \b, \c
+.endm
+
+.macro idct_col4_top y1 y2 y3 y4 i l
+ smull\i v7.4S, \y3\().\l, z2
+ smull\i v16.4S, \y3\().\l, z6
+ smull\i v17.4S, \y2\().\l, z1
+ add v19.4S, v23.4S, v7.4S
+ smull\i v18.4S, \y2\().\l, z3
+ add v20.4S, v23.4S, v16.4S
+ smull\i v5.4S, \y2\().\l, z5
+ sub v21.4S, v23.4S, v16.4S
+ smull\i v6.4S, \y2\().\l, z7
+ sub v22.4S, v23.4S, v7.4S
+
+ smlal\i v17.4S, \y4\().\l, z3
+ smlsl\i v18.4S, \y4\().\l, z7
+ smlsl\i v5.4S, \y4\().\l, z1
+ smlsl\i v6.4S, \y4\().\l, z5
+.endm
+
+.macro idct_row4_neon y1 y2 y3 y4 pass
+ ld1 {\y1\().2D-\y2\().2D}, [x2], #32
+ movi v23.4S, #1<<2, lsl #8
+ orr v5.16B, \y1\().16B, \y2\().16B
+ ld1 {\y3\().2D, \y4\().2D}, [x2], #32
+ orr v6.16B, \y3\().16B, \y4\().16B
+ orr v5.16B, v5.16B, v6.16B
+ mov x3, v5.D[1]
+ smlal v23.4S, \y1\().4H, z4
+
+ idct_col4_top \y1 \y2 \y3 \y4 1 4H
+
+ cmp x3, #0
+ beq \pass\()f
+
+ smull2 v7.4S, \y1\().8H, z4
+ smlal2 v17.4S, \y2\().8H, z5
+ smlsl2 v18.4S, \y2\().8H, z1
+ smull2 v16.4S, \y3\().8H, z2
+ smlal2 v5.4S, \y2\().8H, z7
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v7.4S
+ sub v21.4S, v21.4S, v7.4S
+ add v22.4S, v22.4S, v7.4S
+ smlal2 v6.4S, \y2\().8H, z3
+ smull2 v7.4S, \y3\().8H, z6
+ smlal2 v17.4S, \y4\().8H, z7
+ smlsl2 v18.4S, \y4\().8H, z5
+ smlal2 v5.4S, \y4\().8H, z3
+ smlsl2 v6.4S, \y4\().8H, z1
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v16.4S
+ add v21.4S, v21.4S, v16.4S
+ sub v22.4S, v22.4S, v7.4S
+
+\pass: add \y3\().4S, v19.4S, v17.4S
+ add \y4\().4S, v20.4S, v18.4S
+ shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
+ shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
+ add v7.4S, v21.4S, v5.4S
+ add v16.4S, v22.4S, v6.4S
+ shrn \y3\().4H, v7.4S, #ROW_SHIFT
+ shrn \y4\().4H, v16.4S, #ROW_SHIFT
+ sub v22.4S, v22.4S, v6.4S
+ sub v19.4S, v19.4S, v17.4S
+ sub v21.4S, v21.4S, v5.4S
+ shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
+ sub v20.4S, v20.4S, v18.4S
+ shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
+ shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
+ shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
+
+ trn1 v16.8H, \y1\().8H, \y2\().8H
+ trn2 v17.8H, \y1\().8H, \y2\().8H
+ trn1 v18.8H, \y3\().8H, \y4\().8H
+ trn2 v19.8H, \y3\().8H, \y4\().8H
+ trn1 \y1\().4S, v16.4S, v18.4S
+ trn1 \y2\().4S, v17.4S, v19.4S
+ trn2 \y3\().4S, v16.4S, v18.4S
+ trn2 \y4\().4S, v17.4S, v19.4S
+.endm
+
+.macro declare_idct_col4_neon i l
+function idct_col4_neon\i
+ dup v23.4H, z4c
+.if \i == 1
+ add v23.4H, v23.4H, v24.4H
+.else
+ mov v5.D[0], v24.D[1]
+ add v23.4H, v23.4H, v5.4H
+.endif
+ smull v23.4S, v23.4H, z4
+
+ idct_col4_top v24 v25 v26 v27 \i \l
+
+ mov x4, v28.D[\i - 1]
+ mov x5, v29.D[\i - 1]
+ cmp x4, #0
+ beq 1f
+
+ smull\i v7.4S, v28.\l, z4
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v7.4S
+ sub v21.4S, v21.4S, v7.4S
+ add v22.4S, v22.4S, v7.4S
+
+1: mov x4, v30.D[\i - 1]
+ cmp x5, #0
+ beq 2f
+
+ smlal\i v17.4S, v29.\l, z5
+ smlsl\i v18.4S, v29.\l, z1
+ smlal\i v5.4S, v29.\l, z7
+ smlal\i v6.4S, v29.\l, z3
+
+2: mov x5, v31.D[\i - 1]
+ cmp x4, #0
+ beq 3f
+
+ smull\i v7.4S, v30.\l, z6
+ smull\i v16.4S, v30.\l, z2
+ add v19.4S, v19.4S, v7.4S
+ sub v22.4S, v22.4S, v7.4S
+ sub v20.4S, v20.4S, v16.4S
+ add v21.4S, v21.4S, v16.4S
+
+3: cmp x5, #0
+ beq 4f
+
+ smlal\i v17.4S, v31.\l, z7
+ smlsl\i v18.4S, v31.\l, z5
+ smlal\i v5.4S, v31.\l, z3
+ smlsl\i v6.4S, v31.\l, z1
+
+4: addhn v7.4H, v19.4S, v17.4S
+ addhn2 v7.8H, v20.4S, v18.4S
+ subhn v18.4H, v20.4S, v18.4S
+ subhn2 v18.8H, v19.4S, v17.4S
+
+ addhn v16.4H, v21.4S, v5.4S
+ addhn2 v16.8H, v22.4S, v6.4S
+ subhn v17.4H, v22.4S, v6.4S
+ subhn2 v17.8H, v21.4S, v5.4S
+
+ ret
+endfunc
+.endm
+
+declare_idct_col4_neon 1 4H
+declare_idct_col4_neon 2 8H
+
+function ff_simple_idct_put_neon, export=1
+ idct_start x2
+
+ idct_row4_neon v24 v25 v26 v27 1
+ idct_row4_neon v28 v29 v30 v31 2
+ bl idct_col4_neon1
+
+ sqshrun v1.8B, v7.8H, #COL_SHIFT-16
+ sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
+ sqshrun v3.8B, v17.8H, #COL_SHIFT-16
+ sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sqshrun v2.8B, v7.8H, #COL_SHIFT-16
+ sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
+ sqshrun v4.8B, v17.8H, #COL_SHIFT-16
+ sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
+
+ zip1 v16.4S, v1.4S, v2.4S
+ zip2 v17.4S, v1.4S, v2.4S
+
+ st1 {v16.D}[0], [x0], x1
+ st1 {v16.D}[1], [x0], x1
+
+ zip1 v18.4S, v3.4S, v4.4S
+ zip2 v19.4S, v3.4S, v4.4S
+
+ st1 {v17.D}[0], [x0], x1
+ st1 {v17.D}[1], [x0], x1
+ st1 {v18.D}[0], [x0], x1
+ st1 {v18.D}[1], [x0], x1
+ st1 {v19.D}[0], [x0], x1
+ st1 {v19.D}[1], [x0], x1
+
+ idct_end
+endfunc
+
+function ff_simple_idct_add_neon, export=1
+ idct_start x2
+
+ idct_row4_neon v24 v25 v26 v27 1
+ idct_row4_neon v28 v29 v30 v31 2
+ bl idct_col4_neon1
+
+ sshr v1.8H, V7.8H, #COL_SHIFT-16
+ sshr v2.8H, v16.8H, #COL_SHIFT-16
+ sshr v3.8H, v17.8H, #COL_SHIFT-16
+ sshr v4.8H, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sshr v7.8H, V7.8H, #COL_SHIFT-16
+ sshr v16.8H, v16.8H, #COL_SHIFT-16
+ sshr v17.8H, v17.8H, #COL_SHIFT-16
+ sshr v18.8H, v18.8H, #COL_SHIFT-16
+
+ mov x9, x0
+ ld1 {v19.D}[0], [x0], x1
+ zip1 v23.2D, v1.2D, v7.2D
+ zip2 v24.2D, v1.2D, v7.2D
+ ld1 {v19.D}[1], [x0], x1
+ zip1 v25.2D, v2.2D, v16.2D
+ zip2 v26.2D, v2.2D, v16.2D
+ ld1 {v20.D}[0], [x0], x1
+ zip1 v27.2D, v3.2D, v17.2D
+ zip2 v28.2D, v3.2D, v17.2D
+ ld1 {v20.D}[1], [x0], x1
+ zip1 v29.2D, v4.2D, v18.2D
+ zip2 v30.2D, v4.2D, v18.2D
+ ld1 {v21.D}[0], [x0], x1
+ uaddw v23.8H, v23.8H, v19.8B
+ uaddw2 v24.8H, v24.8H, v19.16B
+ ld1 {v21.D}[1], [x0], x1
+ sqxtun v23.8B, v23.8H
+ sqxtun2 v23.16B, v24.8H
+ ld1 {v22.D}[0], [x0], x1
+ uaddw v24.8H, v25.8H, v20.8B
+ uaddw2 v25.8H, v26.8H, v20.16B
+ ld1 {v22.D}[1], [x0], x1
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v25.8H
+ st1 {v23.D}[0], [x9], x1
+ uaddw v25.8H, v27.8H, v21.8B
+ uaddw2 v26.8H, v28.8H, v21.16B
+ st1 {v23.D}[1], [x9], x1
+ sqxtun v25.8B, v25.8H
+ sqxtun2 v25.16B, v26.8H
+ st1 {v24.D}[0], [x9], x1
+ uaddw v26.8H, v29.8H, v22.8B
+ uaddw2 v27.8H, v30.8H, v22.16B
+ st1 {v24.D}[1], [x9], x1
+ sqxtun v26.8B, v26.8H
+ sqxtun2 v26.16B, v27.8H
+ st1 {v25.D}[0], [x9], x1
+ st1 {v25.D}[1], [x9], x1
+ st1 {v26.D}[0], [x9], x1
+ st1 {v26.D}[1], [x9], x1
+
+ idct_end
+endfunc
+
+function ff_simple_idct_neon, export=1
+ idct_start x0
+
+ mov x2, x0
+ idct_row4_neon v24 v25 v26 v27 1
+ idct_row4_neon v28 v29 v30 v31 2
+ add x2, x2, #-128
+ bl idct_col4_neon1
+
+ sshr v1.8H, v7.8H, #COL_SHIFT-16
+ sshr v2.8H, v16.8H, #COL_SHIFT-16
+ sshr v3.8H, v17.8H, #COL_SHIFT-16
+ sshr v4.8H, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sshr v7.8H, v7.8H, #COL_SHIFT-16
+ sshr v16.8H, v16.8H, #COL_SHIFT-16
+ sshr v17.8H, v17.8H, #COL_SHIFT-16
+ sshr v18.8H, v18.8H, #COL_SHIFT-16
+
+ zip1 v23.2D, v1.2D, v7.2D
+ zip2 v24.2D, v1.2D, v7.2D
+ st1 {v23.2D,V24.2D}, [x2], #32
+ zip1 v25.2D, v2.2D, v16.2D
+ zip2 v26.2D, v2.2D, v16.2D
+ st1 {v25.2D,V26.2D}, [x2], #32
+ zip1 v27.2D, v3.2D, v17.2D
+ zip2 v28.2D, v3.2D, v17.2D
+ st1 {v27.2D,V28.2D}, [x2], #32
+ zip1 v29.2D, v4.2D, v18.2D
+ zip2 v30.2D, v4.2D, v18.2D
+ st1 {v29.2D,V30.2D}, [x2], #32
+
+ idct_end
+endfunc
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index 63e9b5216b..37f4640f0d 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -297,6 +297,8 @@ av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
if (CONFIG_MPEG4_DECODER && avctx->idct_algo == FF_IDCT_XVID)
ff_xvid_idct_init(c, avctx);
+ if (ARCH_AARCH64)
+ ff_idctdsp_init_aarch64(c, avctx, high_bit_depth);
if (ARCH_ALPHA)
ff_idctdsp_init_alpha(c, avctx, high_bit_depth);
if (ARCH_ARM)
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
index b180a6762a..e449be310f 100644
--- a/libavcodec/idctdsp.h
+++ b/libavcodec/idctdsp.h
@@ -100,6 +100,8 @@ extern void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrd
void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx);
+void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth);
void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
--
2.11.1
More information about the ffmpeg-devel
mailing list