[FFmpeg-devel] [RFC][PATCH] swscale: NEON optimized unscaled rgba to nv12 conversion
Yu Xiaolei
dreifachstein at gmail.com
Wed Dec 4 07:57:30 CET 2013
Added copyright headers.
RGB2YUV coeffs are loaded from SwsContext, but signs are still hardcoded.
If this is not acceptable, I will rewrite it using multiply by scalar
at the cost of several more instructions (widening operations) per
16x2 block.
Conversion is done in unsigned 16-bit math. There will be rounding
errors compared to the C implementation.
---
libswscale/arm/Makefile | 3 +
libswscale/arm/rgb2yuv_neon.S | 342 +++++++++++++++++++++++++++++++++++++
libswscale/arm/swscale_unscaled.c | 95 ++++++++++
libswscale/swscale_internal.h | 1 +
libswscale/swscale_unscaled.c | 3 +
5 files changed, 444 insertions(+), 0 deletions(-)
create mode 100644 libswscale/arm/Makefile
create mode 100644 libswscale/arm/rgb2yuv_neon.S
create mode 100644 libswscale/arm/swscale_unscaled.c
diff --git a/libswscale/arm/Makefile b/libswscale/arm/Makefile
new file mode 100644
index 0000000..20f2848
--- /dev/null
+++ b/libswscale/arm/Makefile
@@ -0,0 +1,3 @@
+OBJS += arm/swscale_unscaled.o
+
+NEON-OBJS += arm/rgb2yuv_neon.o
diff --git a/libswscale/arm/rgb2yuv_neon.S b/libswscale/arm/rgb2yuv_neon.S
new file mode 100644
index 0000000..7a0dabf
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon.S
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+
+.macro alias name, target, set=1
+ .if \set != 0
+ \name .req \target
+ .else
+ .unreq \name
+ .endif
+.endm
+
+alias q6_l, d12
+alias q6_r, d13
+alias q7_l, d14
+alias q7_r, d15
+alias q8_l, d16
+alias q8_r, d17
+alias q9_l, d18
+alias q9_r, d19
+
+alias q10_l, d20
+alias q11_l, d22
+alias q12_l, d24
+alias q13_l, d26
+alias q14_l, d28
+alias q14_r, d29
+alias q15_l, d30
+alias q15_r, d31
+
+/* q0-q5 uniforms */
+alias CO_RY, d0
+alias CO_GY, d1
+alias CO_BY, d2
+alias CO_NRU, d3
+alias CO_NGU, d4
+alias CO_BU, d5
+alias CO_RV, d6
+alias CO_NGV, d7
+alias CO_NBV, d8
+alias BIAS_Y, d10
+alias BIAS_U, d11
+alias BIAS_V, BIAS_U
+
+/* q6-q8 rgbx 16x1 */
+
+/* downsampled rgb 8x1 */
+alias r16x8_tmp, q10
+alias r8x8_tmp, q10_l
+alias g16x8_tmp, q11
+alias g8x8_tmp, q11_l
+alias b16x8_tmp, q12
+alias b8x8_tmp, q12_l
+
+alias y16x8_tmp, q13
+alias u16x8_tmp, q14
+alias v16x8_tmp, q15
+
+alias y8x8_l, q14_l
+alias y8x8_r, q14_r
+
+alias p_coeff_tbl, r12
+
+.macro load_coeffs
+ vld3.8 {CO_RY[], CO_GY[], CO_BY[]}, [p_coeff_tbl]!
+ vld3.8 {CO_NRU[], CO_NGU[], CO_BU[]}, [p_coeff_tbl]!
+ vld3.8 {CO_RV[], CO_NGV[], CO_NBV[]}, [p_coeff_tbl]!
+.endm
+
+.macro load_biases
+ vmov.u8 BIAS_Y, #16
+ vmov.u8 BIAS_U, #128
+.endm
+
+.macro load_8888_16x1 src, count
+.if \count == 0
+ vld4.8 {q6_l, q7_l, q8_l, q9_l}, [\src]!
+ vld4.8 {q6_r, q7_r, q8_r, q9_r}, [\src]!
+.else
+ vld4.8 {q6_l, q7_l, q8_l, q9_l}, [\src]!
+ vld4.8 {q6_r, q7_r, q8_r, q9_r}, [\src]
+ sub \src, \src, #32
+ add \src, \src, \count, LSL #2
+.endif
+.endm
+
+.macro alias_8x16 name, qw, set=1
+ alias \name\()8x16, \qw, \set
+ alias \name\()8x8_l, \qw\()_l, \set
+ alias \name\()8x8_r, \qw\()_r, \set
+.endm
+
+.macro alias_src_rgbx set=1
+ alias_8x16 r, q6, \set
+ alias_8x16 g, q7, \set
+ alias_8x16 b, q8, \set
+ alias_8x16 x, q9, \set
+.endm
+
+.macro load_rgbx_16x1 src, count
+ load_8888_16x1 \src, \count
+.endm
+
+
+.macro alias_src_bgrx set=1
+ alias_8x16 b, q6, \set
+ alias_8x16 g, q7, \set
+ alias_8x16 r, q8, \set
+ alias_8x16 x, q9, \set
+.endm
+
+.macro load_bgrx_16x1 src, count
+ load_8888_16x1 \src, \count
+.endm
+
+
+.macro alias_dst_nv12 set=1
+ alias u8x8_tmp, x8x8_l, \set
+ alias v8x8_tmp, x8x8_r, \set
+.endm
+
+.macro store_chroma_nv12_8x1 dst, count
+.if \count == 0
+ vst2.i8 {u8x8_tmp, v8x8_tmp}, [\dst]!
+.else
+ vst2.i8 {u8x8_tmp, v8x8_tmp}, [\dst], \count
+.endif
+.endm
+
+.macro alias_dst_nv21 set=1
+ alias v8x8_tmp, x8x8_l, \set
+ alias u8x8_tmp, x8x8_r, \set
+.endm
+
+.macro store_chroma_nv21_8x1 dst, count
+.if \count == 0
+ vst2.i8 {v8x8_tmp, u8x8_tmp}, [\dst]!
+.else
+ vst2.i8 {v8x8_tmp, u8x8_tmp}, [\dst], \count
+.endif
+.endm
+
+.macro store_y_16x1 dst, count
+.if \count == 0
+ vstmia \dst!, {y8x8_l, y8x8_r}
+.else
+ vstmia \dst, {y8x8_l, y8x8_r}
+ add \dst, \dst, \count
+.endif
+.endm
+
+.macro compute_y_8x1 suf
+ vmull.u8 y16x8_tmp, r8x8_\suf, CO_RY
+ vmlal.u8 y16x8_tmp, g8x8_\suf, CO_GY
+ vmlal.u8 y16x8_tmp, b8x8_\suf, CO_BY
+ vrshrn.i16 y8x8_\suf, y16x8_tmp, #8
+ vadd.u8 y8x8_\suf, y8x8_\suf, BIAS_Y
+.endm
+
+.macro compute_u_8x1 suf
+ vmull.u8 u16x8_\suf, b8x8_\suf, CO_BU
+ vmlsl.u8 u16x8_\suf, r8x8_\suf, CO_NRU
+ vmlsl.u8 u16x8_\suf, g8x8_\suf, CO_NGU
+ vrshrn.i16 u8x8_\suf, u16x8_\suf, #8
+ vadd.u8 u8x8_\suf, u8x8_\suf, BIAS_U
+.endm
+
+.macro compute_v_8x1 suf
+ vmull.u8 v16x8_\suf, r8x8_\suf, CO_RV
+ vmlsl.u8 v16x8_\suf, g8x8_\suf, CO_NGV
+ vmlsl.u8 v16x8_\suf, b8x8_\suf, CO_NBV
+ vrshrn.i16 v8x8_\suf, v16x8_\suf, #8
+ vadd.u8 v8x8_\suf, v8x8_\suf, BIAS_V
+.endm
+
+.macro kernel_420_16x2 rgb_fmt, yuv_fmt, p_src0, p_src1, p_y0, p_y1, p_c, count=0
+ alias_src_\rgb_fmt
+ alias_dst_\yuv_fmt
+
+ load_\rgb_fmt\()_16x1 \p_src0, \count
+
+ compute_y_8x1 l
+    compute_y_8x1 r
+ store_y_16x1 \p_y0, \count
+
+ // downsample step 1
+ vpaddl.u8 r16x8_tmp, r8x16
+ vpaddl.u8 g16x8_tmp, g8x16
+ vpaddl.u8 b16x8_tmp, b8x16
+
+    load_\rgb_fmt\()_16x1 \p_src1, \count
+
+ compute_y_8x1 l
+ compute_y_8x1 r
+ store_y_16x1 \p_y1, \count
+
+ // downsample step 2
+ vpadal.u8 r16x8_tmp, r8x16
+ vpadal.u8 g16x8_tmp, g8x16
+ vpadal.u8 b16x8_tmp, b8x16
+
+ // downsample step 3
+ vrshrn.u16 r8x8_tmp, r16x8_tmp, #2
+ vrshrn.u16 g8x8_tmp, g16x8_tmp, #2
+ vrshrn.u16 b8x8_tmp, b16x8_tmp, #2
+
+ compute_u_8x1 tmp
+ compute_v_8x1 tmp
+
+ store_chroma_\yuv_fmt\()_8x1 \p_c, \count
+
+ alias_dst_\yuv_fmt 0
+ alias_src_\rgb_fmt 0
+.endm
+
+.macro prologue
+ push {r4-r12, lr}
+ vpush {q4-q7}
+.endm
+
+.macro epilogue
+ vpop {q4-q7}
+ pop {r4-r12, pc}
+.endm
+
+.macro load_arg reg, ix
+ ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
+.endm
+
+
+
+/* ()_to_()_neon(const uint8_t *p_src, uint8_t *p_y, uint8_t *p_chroma,
+ * int width, int height,
+ * int y_stride, int c_stride, int src_stride,
+ * uint8_t p_coeff_tbl[9]);
+ */
+.macro alias_loop_420sp set=1
+ alias p_src, r0, \set
+ alias p_src0, p_src, \set
+ alias p_y, r1, \set
+ alias p_y0, p_y, \set
+ alias p_c, r2, \set
+ alias width, r3, \set
+ alias header, width, \set
+
+ alias height, r4, \set
+ alias y_stride, r5, \set
+ alias c_stride, r6, \set
+ alias c_padding, c_stride, \set
+ alias src_stride, r7, \set
+
+ alias p_y0_end, r8, \set
+
+ alias src_padding,r9, \set
+ alias y_padding, r10, \set
+
+ alias p_src1, r11, \set
+ alias p_y1, r12, \set
+
+.endm
+
+.macro loop_420sp s_fmt, d_fmt
+
+function \s_fmt\()_to_\d_fmt\()_neon, export=1
+ prologue
+
+ alias_loop_420sp
+
+ load_arg height, 4
+ load_arg y_stride, 5
+ load_arg c_stride, 6
+ load_arg src_stride, 7
+ load_arg p_coeff_tbl, 8
+
+ load_coeffs
+ load_biases
+
+ sub y_padding, y_stride, width
+ sub c_padding, c_stride, width
+ sub src_padding, src_stride, width, LSL #2
+
+ add p_y0_end, p_y0, width
+ and header, width, #15
+
+ add p_y1, p_y0, y_stride
+ add p_src1, p_src0, src_stride
+
+0:
+ cmp header, #0
+ beq 1f
+
+    kernel_420_16x2 \s_fmt, \d_fmt, p_src0, p_src1, p_y0, p_y1, p_c, header
+
+1:
+ kernel_420_16x2 \s_fmt, \d_fmt, p_src0, p_src1, p_y0, p_y1, p_c
+
+ cmp p_y0, p_y0_end
+    blo 1b
+2:
+ add p_y0, p_y1, y_padding
+ add p_y0_end, p_y1, y_stride
+ add p_c, p_c, c_padding
+ add p_src0, p_src1, src_padding
+
+ add p_y1, p_y0, y_stride
+ add p_src1, p_src0, src_stride
+
+ subs height, height, #2
+
+ bgt 0b
+
+ epilogue
+
+ alias_loop_420sp 0
+
+endfunc
+.endm
+
+ loop_420sp rgbx, nv12
+ loop_420sp rgbx, nv21
+ loop_420sp bgrx, nv12
+ loop_420sp bgrx, nv21
+
diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c
new file mode 100644
index 0000000..94fae9a
--- /dev/null
+++ b/libswscale/arm/swscale_unscaled.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/arm/cpu.h"
+
+extern void rgbx_to_nv12_neon(const uint8_t *p_src, uint8_t *p_y, uint8_t *p_chroma,
+ int width, int height,
+ int y_stride, int c_stride, int src_stride,
+ uint8_t p_coeff_tbl[9]);
+
+static void get_rgb2yuv_table(SwsContext *context, uint8_t dst[9]) {
+ int32_t *src = context->input_rgb2yuv_table;
+
+ dst[RY_IDX] = RSHIFT(src[RY_IDX], RGB2YUV_SHIFT - 8);
+ dst[GY_IDX] = RSHIFT(src[GY_IDX], RGB2YUV_SHIFT - 8);
+ dst[BY_IDX] = RSHIFT(src[BY_IDX], RGB2YUV_SHIFT - 8);
+ dst[RU_IDX] = - RSHIFT(src[RU_IDX], RGB2YUV_SHIFT - 8);
+ dst[GU_IDX] = - RSHIFT(src[GU_IDX], RGB2YUV_SHIFT - 8);
+ dst[BU_IDX] = RSHIFT(src[BU_IDX], RGB2YUV_SHIFT - 8);
+ dst[RV_IDX] = RSHIFT(src[RV_IDX], RGB2YUV_SHIFT - 8);
+ dst[GV_IDX] = - RSHIFT(src[GV_IDX], RGB2YUV_SHIFT - 8);
+ dst[BV_IDX] = - RSHIFT(src[BV_IDX], RGB2YUV_SHIFT - 8);
+}
+
+static int rgbx_to_nv12_neon_wrapper(SwsContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]) {
+ uint8_t table[9];
+
+ int src_pixel_width = srcStride[0] / 4;
+ int y_pixel_width = dstStride[0];
+ int c_pixel_width = dstStride[1] / 2;
+
+ int aligned_width = FFALIGN(context->srcW, 16);
+ int width;
+
+ if (aligned_width <= src_pixel_width
+ && aligned_width <= y_pixel_width
+ && aligned_width <= c_pixel_width) {
+ width = aligned_width;
+ } else {
+ width = context->srcW;
+ }
+
+ get_rgb2yuv_table(context, table);
+
+    av_log(context, AV_LOG_DEBUG, "src(%p) y(%p) chroma(%p)\n",
+           src[0], dst[0], dst[1]);
+    av_log(context, AV_LOG_DEBUG, "srcStride(%d) yStride(%d) cStride(%d)\n",
+           srcStride[0], dstStride[0], dstStride[1]);
+
+ rgbx_to_nv12_neon(src[0] + srcSliceY * srcStride[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY / 2) * dstStride[1],
+ width, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ table);
+
+ return 0;
+}
+
+static void get_unscaled_swscale_neon(SwsContext *c) {
+ if (c->srcFormat == AV_PIX_FMT_RGBA
+ && c->dstFormat == AV_PIX_FMT_NV12
+ && (c->srcW >= 16)) {
+ c->swscale = rgbx_to_nv12_neon_wrapper;
+ }
+}
+
+void ff_get_unscaled_swscale_arm(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_neon(cpu_flags))
+ get_unscaled_swscale_neon(c);
+}
+
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6ad278e..443615d 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -835,6 +835,7 @@ extern const AVClass sws_context_class;
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_bfin(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
+void ff_get_unscaled_swscale_arm(SwsContext *c);
/**
* Return function pointer to fastest main scaler path function depending
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 4181e0d..ccf9980 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1384,6 +1384,9 @@ void ff_get_unscaled_swscale(SwsContext *c)
ff_get_unscaled_swscale_bfin(c);
if (ARCH_PPC)
ff_get_unscaled_swscale_ppc(c);
+ if (ARCH_ARM)
+ ff_get_unscaled_swscale_arm(c);
+
}
/* Convert the palette to the same packed 32-bit format as the palette */
--
1.7.9
More information about the ffmpeg-devel
mailing list