[FFmpeg-devel] [PATCH] ARM: NEON optimised simple_idct
Mans Rullgard
mans
Mon Aug 25 05:06:33 CEST 2008
---
libavcodec/Makefile | 2 +
libavcodec/armv4l/dsputil_arm.c | 15 ++
libavcodec/armv4l/simple_idct_neon.S | 383 ++++++++++++++++++++++++++++++++++
libavcodec/avcodec.h | 1 +
libavcodec/utils.c | 1 +
5 files changed, 402 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/armv4l/simple_idct_neon.S
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index e749b48..03a5df6 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -431,6 +431,8 @@ ASM_OBJS-$(HAVE_ARMV5TE) += armv4l/simple_idct_armv5te.o \
ASM_OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \
+ASM_OBJS-$(HAVE_NEON) += armv4l/simple_idct_neon.o \
+
OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \
sparc/simple_idct_vis.o \
diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c
index 100b89e..2d0759a 100644
--- a/libavcodec/armv4l/dsputil_arm.c
+++ b/libavcodec/armv4l/dsputil_arm.c
@@ -42,6 +42,12 @@ extern void ff_simple_idct_put_armv6(uint8_t *dest, int line_size,
extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size,
DCTELEM *data);
+extern void ff_simple_idct_neon(DCTELEM *data);
+extern void ff_simple_idct_put_neon(uint8_t *dest, int line_size,
+ DCTELEM *data);
+extern void ff_simple_idct_add_neon(uint8_t *dest, int line_size,
+ DCTELEM *data);
+
/* XXX: local hack */
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
@@ -232,6 +238,8 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
if(idct_algo == FF_IDCT_AUTO){
#if defined(HAVE_IPP)
idct_algo = FF_IDCT_IPP;
+#elif defined(HAVE_NEON)
+ idct_algo = FF_IDCT_SIMPLENEON;
#elif defined(HAVE_ARMV6)
idct_algo = FF_IDCT_SIMPLEARMV6;
#elif defined(HAVE_ARMV5TE)
@@ -272,6 +280,13 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
c->idct = simple_idct_ipp;
c->idct_permutation_type= FF_NO_IDCT_PERM;
#endif
+#ifdef HAVE_NEON
+ } else if (idct_algo==FF_IDCT_SIMPLENEON){
+ c->idct_put= ff_simple_idct_put_neon;
+ c->idct_add= ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->idct_permutation_type = FF_NO_IDCT_PERM;
+#endif
}
}
diff --git a/libavcodec/armv4l/simple_idct_neon.S b/libavcodec/armv4l/simple_idct_neon.S
new file mode 100644
index 0000000..c72d323
--- /dev/null
+++ b/libavcodec/armv4l/simple_idct_neon.S
@@ -0,0 +1,383 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 1
+#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 2
+#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 3
+#define W4 16383 //i = 4: the formula above gives 16384, one more than the value used here (NOTE(review): presumably kept equal to the C simple IDCT this is based on -- confirm)
+#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 5
+#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 6
+#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 7
+#define W4c ((1<<(COL_SHIFT-1))/W4) /* column-pass rounding term divided by W4; added to col[0] before the W4 multiply */
+#define ROW_SHIFT 11 /* right shift applied to the row-pass sums */
+#define COL_SHIFT 20 /* right shift applied to the column-pass sums */
+
+#define w1 d0[0] /* w1..w4c: lanes of d0/d1, which idct_start loads from the const table in this order */
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
+
+ .fpu neon
+
+ .macro idct_col4_top /* in: q15 = W4*col[0] + rounding, d4 = col[1], d6 = col[2], d8 = col[3]; out: even sums q11-q14, odd sums q9, q10, q5, q6 */
+ vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
+ vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
+ vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
+ vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */
+ vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
+ vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */
+ vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
+ vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */
+ vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
+ vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */
+
+ vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
+ vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
+ vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
+ vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
+ .endm
+
+ .macro idct_col4_mid1 /* fold col[4] (d3) into the even sums q11-q14; overwrites q7 */
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */
+ vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */
+ vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */
+ vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */
+ .endm
+
+ .macro idct_col4_mid2 /* fold col[5] (d5) into the odd sums q9, q10, q5, q6 */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+ .endm
+
+ .macro idct_col4_mid3 /* fold col[6] (d7) into the even sums q11-q14; overwrites q7, q8 */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */
+ vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */
+ vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */
+ vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */
+ .endm
+
+ .macro idct_col4_mid4 /* fold col[7] (d9) into the odd sums q9, q10, q5, q6 */
+ vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */
+ vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */
+ vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */
+ vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */
+ .endm
+
+ .macro idct_col4_mid /* the operations of idct_col4_mid1..mid4 (col[4]..col[7]) in one interleaved sequence; overwrites q7, q8 */
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+ vadd.i32 q14, q14, q7 /* q14 += W4 * col[4]; last use of q7 before it is reloaded */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */
+ vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */
+ vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */
+ vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */
+ vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */
+ vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */
+ vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */
+ vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */
+ .endm
+
+ .macro idct_col4_end /* final butterfly: q3, q4, q7, q8 = even + odd; q11-q14 = even - odd (in place) */
+ vadd.i32 q3, q11, q9 /* q3 = q11 + q9 */
+ vadd.i32 q4, q12, q10 /* q4 = q12 + q10 */
+ vadd.i32 q7, q13, q5 /* q7 = q13 + q5 */
+ vadd.i32 q8, q14, q6 /* q8 = q14 + q6 */
+ vsub.i32 q11, q11, q9 /* q11 = q11 - q9 */
+ vsub.i32 q12, q12, q10 /* q12 = q12 - q10 */
+ vsub.i32 q13, q13, q5 /* q13 = q13 - q5 */
+ vsub.i32 q14, q14, q6 /* q14 = q14 - q6 */
+ .endm
+
+ .text
+ .align
+ .type idct_row4_neon, %function
+ .func idct_row4_neon
+idct_row4_neon: /* in-place row pass over the 64 bytes (four rows of eight 16-bit values) at a3; needs w1-w4c loaded; a3 ends 64 bytes further on; overwrites a4 and v1 */
+ vld1.64 {d2,d3}, [a3,:128]! /* q1 = row 0 */
+ vld1.64 {d4,d5}, [a3,:128]! /* q2 = row 1 */
+ vld1.64 {d6,d7}, [a3,:128]! /* q3 = row 2 */
+ vld1.64 {d8,d9}, [a3,:128]! /* q4 = row 3 */
+ add a3, a3, #-64 /* back to row 0 for the stores at the end */
+
+ vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* rounding term for the final shift */
+ vorr d10, d3, d5 /* d10, d11: OR together elements 4-7 of the four rows (done before the transpose) */
+ vtrn.16 q1, q2 /* the four vtrn transpose each 4x4 half: d2, d4, d6, d8 = col[0..3]; d3, d5, d7, d9 = col[4..7] */
+ vorr d11, d7, d9
+ vtrn.16 q3, q4
+ vorr d10, d10, d11
+ vtrn.32 q1, q3
+ vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
+ vtrn.32 q2, q4
+ vmov a4, v1, d10 /* copy the OR result to core registers for the zero test below */
+
+ idct_col4_top
+
+ orrs a4, a4, v1 /* Z is set when elements 4-7 of all four rows are zero */
+ beq 1f /* ... and then the col[4..7] terms are skipped */
+ idct_col4_mid
+1:
+ vadd.i32 q3, q11, q9 /* same butterfly as idct_col4_end, interleaved with the narrowing shifts */
+ vadd.i32 q4, q12, q10
+ vshrn.i32 d2, q3, #ROW_SHIFT /* d2 = (q11 + q9) >> ROW_SHIFT, narrowed to 16 bits */
+ vadd.i32 q7, q13, q5
+ vshrn.i32 d4, q4, #ROW_SHIFT /* d4 = (q12 + q10) >> ROW_SHIFT */
+ vadd.i32 q8, q14, q6
+ vshrn.i32 d6, q7, #ROW_SHIFT /* d6 = (q13 + q5) >> ROW_SHIFT */
+ vsub.i32 q11, q11, q9
+ vshrn.i32 d8, q8, #ROW_SHIFT /* d8 = (q14 + q6) >> ROW_SHIFT */
+ vsub.i32 q12, q12, q10
+ vshrn.i32 d9, q11, #ROW_SHIFT /* d9 = (q11 - q9) >> ROW_SHIFT */
+ vsub.i32 q13, q13, q5
+ vshrn.i32 d7, q12, #ROW_SHIFT /* d7 = (q12 - q10) >> ROW_SHIFT */
+ vsub.i32 q14, q14, q6
+ vshrn.i32 d5, q13, #ROW_SHIFT /* d5 = (q13 - q5) >> ROW_SHIFT */
+ vshrn.i32 d3, q14, #ROW_SHIFT /* d3 = (q14 - q6) >> ROW_SHIFT */
+
+ vtrn.16 q1, q2 /* same four vtrn as above: transpose back to one row per q register */
+ vtrn.16 q3, q4
+ vtrn.32 q1, q3
+ vtrn.32 q2, q4
+
+ vst1.64 {d2,d3}, [a3,:128]! /* write the four rows back over the input */
+ vst1.64 {d4,d5}, [a3,:128]!
+ vst1.64 {d6,d7}, [a3,:128]!
+ vst1.64 {d8,d9}, [a3,:128]!
+
+ mov pc, lr
+ .endfunc
+
+ .align
+ .type idct_col4_neon, %function
+ .func idct_col4_neon
+idct_col4_neon: /* column pass over four adjacent columns at a3 (eight loads, 16 bytes apart); results are left in d2-d9, nothing is stored; a3 ends 128 bytes further on; overwrites v1-v4, ip */
+ mov ip, #16
+ vld1.64 {d2}, [a3,:64], ip /* d2 = col[0] */
+ vld1.64 {d4}, [a3,:64], ip /* d4 = col[1] */
+ vld1.64 {d6}, [a3,:64], ip /* d6 = col[2] */
+ vld1.64 {d8}, [a3,:64], ip /* d8 = col[3] */
+ vld1.64 {d3}, [a3,:64], ip /* d3 = col[4] */
+ vld1.64 {d5}, [a3,:64], ip /* d5 = col[5] */
+ vld1.64 {d7}, [a3,:64], ip /* d7 = col[6] */
+ vld1.64 {d9}, [a3,:64], ip /* d9 = col[7] */
+
+ vdup.16 d30, w4c
+ vadd.i16 d30, d30, d2 /* 16-bit add: d30 = col[0] + W4c */
+ vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<(COL_SHIFT-1))/W4) */
+
+ ldrd v1, [a3, #-64] /* v1,v2 = the 8 bytes of col[4] (a3 is 128 past the first load) */
+ ldrd v3, [a3, #-48] /* v3,v4 = the 8 bytes of col[5] */
+ orrs v1, v1, v2 /* Z set if col[4] is all zero */
+
+ idct_col4_top
+ beq 1f /* tests the orrs above; the macro in between contains only NEON instructions */
+ idct_col4_mid1
+1: orrs v3, v3, v4 /* Z set if col[5] is all zero */
+ ldrd v1, [a3, #-32] /* v1,v2 = the 8 bytes of col[6] */
+ beq 2f /* skip the col[5] terms */
+ idct_col4_mid2
+2: orrs v1, v1, v2 /* Z set if col[6] is all zero */
+ ldrd v1, [a3, #-16] /* v1,v2 = the 8 bytes of col[7] */
+ beq 3f /* skip the col[6] terms */
+ idct_col4_mid3
+3: orrs v1, v1, v2 /* Z set if col[7] is all zero */
+ beq 4f /* skip the col[7] terms */
+ idct_col4_mid4
+4:
+ idct_col4_end
+
+ vshr.s32 q2, q3, #COL_SHIFT /* shift each sum, then narrow to 16 bits into d2..d9 */
+ vshr.s32 q3, q4, #COL_SHIFT
+ vmovn.i32 d2, q2 /* d2 = (even0 + odd0) >> COL_SHIFT */
+ vshr.s32 q4, q7, #COL_SHIFT
+ vmovn.i32 d3, q3
+ vshr.s32 q5, q8, #COL_SHIFT
+ vmovn.i32 d4, q4
+ vshr.s32 q6, q14, #COL_SHIFT /* the differences are taken in reverse order: q14, q13, q12, q11 */
+ vmovn.i32 d5, q5
+ vshr.s32 q7, q13, #COL_SHIFT
+ vmovn.i32 d6, q6
+ vshr.s32 q8, q12, #COL_SHIFT
+ vmovn.i32 d7, q7
+ vshr.s32 q9, q11, #COL_SHIFT
+ vmovn.i32 d8, q8
+ vmovn.i32 d9, q9 /* d2..d9 are now in the order idct_col4_st16 / idct_col4_st8 consume them */
+
+ mov pc, lr
+ .endfunc
+
+ .macro idct_col4_st16 /* store d2..d9 as eight 8-byte groups at a3, 16 bytes apart; a3 ends 128 bytes further on; overwrites ip */
+ mov ip, #16
+ vst1.64 {d2}, [a3,:64], ip
+ vst1.64 {d3}, [a3,:64], ip
+ vst1.64 {d4}, [a3,:64], ip
+ vst1.64 {d5}, [a3,:64], ip
+ vst1.64 {d6}, [a3,:64], ip
+ vst1.64 {d7}, [a3,:64], ip
+ vst1.64 {d8}, [a3,:64], ip
+ vst1.64 {d9}, [a3,:64], ip
+ .endm
+
+ .align
+ .type idct_col4_add8, %function
+ .func idct_col4_add8
+idct_col4_add8: /* add the eight 4-byte rows at a1 (stride a2) to the 16-bit values in q1-q4, then continue into idct_col4_st8 */
+ vld1.32 {d10[0]}, [a1,:32], a2 /* each of d10-d13 collects two consecutive 4-byte rows */
+ vld1.32 {d10[1]}, [a1,:32], a2
+ vld1.32 {d11[0]}, [a1,:32], a2
+ vld1.32 {d11[1]}, [a1,:32], a2
+ vld1.32 {d12[0]}, [a1,:32], a2
+ vld1.32 {d12[1]}, [a1,:32], a2
+ vld1.32 {d13[0]}, [a1,:32], a2
+ vld1.32 {d13[1]}, [a1,:32], a2
+
+ vaddw.u8 q1, q1, d10 /* widen the bytes as unsigned and add them to the 16-bit lanes */
+ vaddw.u8 q2, q2, d11
+ vaddw.u8 q3, q3, d12
+ vaddw.u8 q4, q4, d13
+
+ sub a1, a1, a2, lsl #3 /* rewind a1 by eight rows; there is no return here, so execution falls through into idct_col4_st8 */
+ .endfunc
+
+ .type idct_col4_st8, %function
+ .func idct_col4_st8
+idct_col4_st8: /* narrow q1-q4 to unsigned bytes with saturation and store them as eight 4-byte rows at a1 (stride a2); a1 ends eight rows further on; also entered by fall-through from idct_col4_add8 */
+ vqmovun.s16 d2, q1 /* signed 16-bit -> unsigned 8-bit, saturating; d2 = rows 0 and 1 */
+ vqmovun.s16 d3, q2
+ vqmovun.s16 d4, q3
+ vqmovun.s16 d5, q4
+
+ vst1.32 {d2[0]}, [a1,:32], a2 /* one 4-byte row per store */
+ vst1.32 {d2[1]}, [a1,:32], a2
+ vst1.32 {d3[0]}, [a1,:32], a2
+ vst1.32 {d3[1]}, [a1,:32], a2
+ vst1.32 {d4[0]}, [a1,:32], a2
+ vst1.32 {d4[1]}, [a1,:32], a2
+ vst1.32 {d5[0]}, [a1,:32], a2
+ vst1.32 {d5[1]}, [a1,:32], a2
+
+ mov pc, lr
+ .endfunc
+
+ .align 4 /* the operand is a power of two in ARM gas, i.e. 16-byte alignment, as the :128 load in idct_start requires */
+const: .short W1, W2, W3, W4, W5, W6, W7, W4c /* loaded into d0/d1 by idct_start; the order matches the w1-w4c lane aliases */
+
+ .macro idct_start data /* common prologue; data = register holding the coefficient block pointer; overwrites a4 */
+ push {v1-v4, lr} /* v1-v4 are used as scratch by the row and column helpers */
+ pld [\data] /* preload hints for offsets 0 and 64 of the block */
+ pld [\data, #64]
+ dmb /* NOTE(review): the reason for this barrier is not evident from this file */
+ vpush {d8-d15} /* q4-q7 are overwritten by the helpers */
+ adr a4, const
+ vld1.64 {d0,d1}, [a4,:128] /* d0/d1 = W1..W7, W4c */
+ .endm
+
+ .macro idct_end /* common epilogue, mirror of idct_start */
+ vpop {d8-d15}
+ pop {v1-v4, pc} /* restores v1-v4 and returns by popping the saved lr into pc */
+ .endm
+
+ .align
+ .global ff_simple_idct_neon
+ .type ff_simple_idct_neon, %function
+ .func ff_simple_idct_neon
+/* void ff_simple_idct_neon(DCTELEM *data); */
+ff_simple_idct_neon: /* in-place IDCT of the block at data (a1): two row passes, then two 4-column passes written back to data */
+ idct_start a1
+
+ mov a3, a1 /* the helpers address the block through a3 */
+ bl idct_row4_neon /* first 64 bytes */
+ bl idct_row4_neon /* next 64 bytes; a3 = data + 128 afterwards */
+ add a3, a3, #-128 /* a3 = data */
+ bl idct_col4_neon /* first four columns -> d2-d9 */
+ add a3, a3, #-128 /* undo the 128-byte advance of the column loads */
+ idct_col4_st16
+ add a3, a3, #-120 /* a3 = data + 8 bytes: the other four columns */
+ bl idct_col4_neon
+ add a3, a3, #-128 /* a3 = data + 8 again for the store */
+ idct_col4_st16
+
+ idct_end
+ .endfunc
+
+ .align
+ .global ff_simple_idct_put_neon
+ .type ff_simple_idct_put_neon, %function
+ .func ff_simple_idct_put_neon
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+ff_simple_idct_put_neon: /* IDCT of data (a3), saturated to unsigned bytes and stored to dst (a1) with stride line_size (a2); data is left holding the row-pass result */
+ idct_start a3
+
+ bl idct_row4_neon /* first 64 bytes of data */
+ bl idct_row4_neon /* next 64 bytes; a3 = data + 128 afterwards */
+ add a3, a3, #-128 /* a3 = data */
+ bl idct_col4_neon /* first four columns -> d2-d9 */
+ bl idct_col4_st8 /* bytes 0-3 of eight dst rows */
+ sub a1, a1, a2, lsl #3 /* a1 back to the first dst row */
+ add a1, a1, #4 /* ... and on to bytes 4-7 */
+ add a3, a3, #-120 /* a3 = data + 8 bytes: the other four columns */
+ bl idct_col4_neon
+ bl idct_col4_st8
+
+ idct_end
+ .endfunc
+
+ .align
+ .global ff_simple_idct_add_neon
+ .type ff_simple_idct_add_neon, %function
+ .func ff_simple_idct_add_neon
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+ff_simple_idct_add_neon: /* IDCT of data (a3), added to the bytes already at dst (a1, stride line_size in a2), saturated and stored back; data is left holding the row-pass result */
+ idct_start a3
+
+ bl idct_row4_neon /* first 64 bytes of data */
+ bl idct_row4_neon /* next 64 bytes; a3 = data + 128 afterwards */
+ add a3, a3, #-128 /* a3 = data */
+ bl idct_col4_neon /* first four columns -> d2-d9 */
+ bl idct_col4_add8 /* adds dst bytes 0-3 of eight rows, then stores and returns via idct_col4_st8 */
+ sub a1, a1, a2, lsl #3 /* a1 back to the first dst row */
+ add a1, a1, #4 /* ... and on to bytes 4-7 */
+ add a3, a3, #-120 /* a3 = data + 8 bytes: the other four columns */
+ bl idct_col4_neon
+ bl idct_col4_add8
+
+ idct_end
+ .endfunc
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 6691089..08fc699 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1381,6 +1381,7 @@ typedef struct AVCodecContext {
#define FF_IDCT_SIMPLEVIS 18
#define FF_IDCT_WMV2 19
#define FF_IDCT_FAAN 20
+#define FF_IDCT_SIMPLENEON 21
/**
* slice count
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index fa5bf1a..12bb4f3 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -559,6 +559,7 @@ static const AVOption options[]={
{"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simplearmv6", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV6, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"},
{"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"},
{"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"},
{"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"},
--
1.6.0
More information about the ffmpeg-devel
mailing list