[FFmpeg-devel] [RFC][PATCH] swscale: NEON optimized unscaled rgba to nv12 conversion
Yu Xiaolei
dreifachstein at gmail.com
Wed Dec 4 07:57:30 CET 2013
Added copyright headers.
RGB2YUV coeffs are loaded from SwsContext, but signs are still hardcoded.
If this is not acceptable, I will rewrite it using multiply by scalar
at the cost of several more instructions (widening operations) per
16x2 block.
Conversion is done in unsigned 16-bit math. There will be rounding
errors compared to the C implementation.
---
libswscale/arm/Makefile | 3 +
libswscale/arm/rgb2yuv_neon.S | 342 +++++++++++++++++++++++++++++++++++++
libswscale/arm/swscale_unscaled.c | 95 ++++++++++
libswscale/swscale_internal.h | 1 +
libswscale/swscale_unscaled.c | 3 +
5 files changed, 444 insertions(+), 0 deletions(-)
create mode 100644 libswscale/arm/Makefile
create mode 100644 libswscale/arm/rgb2yuv_neon.S
create mode 100644 libswscale/arm/swscale_unscaled.c
diff --git a/libswscale/arm/Makefile b/libswscale/arm/Makefile
new file mode 100644
index 0000000..20f2848
--- /dev/null
+++ b/libswscale/arm/Makefile
@@ -0,0 +1,3 @@
+OBJS += arm/swscale_unscaled.o
+
+NEON-OBJS += arm/rgb2yuv_neon.o
diff --git a/libswscale/arm/rgb2yuv_neon.S b/libswscale/arm/rgb2yuv_neon.S
new file mode 100644
index 0000000..7a0dabf
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon.S
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+
+.macro alias name, target, set=1
+ .if \set != 0
+ \name .req \target
+ .else
+ .unreq \name
+ .endif
+.endm
+
+alias q6_l, d12
+alias q6_r, d13
+alias q7_l, d14
+alias q7_r, d15
+alias q8_l, d16
+alias q8_r, d17
+alias q9_l, d18
+alias q9_r, d19
+
+alias q10_l, d20
+alias q11_l, d22
+alias q12_l, d24
+alias q13_l, d26
+alias q14_l, d28
+alias q14_r, d29
+alias q15_l, d30
+alias q15_r, d31
+
+/* q0-q5 uniforms */
+alias CO_RY, d0
+alias CO_GY, d1
+alias CO_BY, d2
+alias CO_NRU, d3
+alias CO_NGU, d4
+alias CO_BU, d5
+alias CO_RV, d6
+alias CO_NGV, d7
+alias CO_NBV, d8
+alias BIAS_Y, d10
+alias BIAS_U, d11
+alias BIAS_V, BIAS_U
+
+/* q6-q8 rgbx 16x1 */
+
+/* downsampled rgb 8x1 */
+alias r16x8_tmp, q10
+alias r8x8_tmp, q10_l
+alias g16x8_tmp, q11
+alias g8x8_tmp, q11_l
+alias b16x8_tmp, q12
+alias b8x8_tmp, q12_l
+
+alias y16x8_tmp, q13
+alias u16x8_tmp, q14
+alias v16x8_tmp, q15
+
+alias y8x8_l, q14_l
+alias y8x8_r, q14_r
+
+alias p_coeff_tbl, r12
+
+.macro load_coeffs
+ vld3.8 {CO_RY[], CO_GY[], CO_BY[]}, [p_coeff_tbl]!
+ vld3.8 {CO_NRU[], CO_NGU[], CO_BU[]}, [p_coeff_tbl]!
+ vld3.8 {CO_RV[], CO_NGV[], CO_NBV[]}, [p_coeff_tbl]!
+.endm
+
+.macro load_biases
+ vmov.u8 BIAS_Y, #16
+ vmov.u8 BIAS_U, #128
+.endm
+
+.macro load_8888_16x1 src, count
+.if \count == 0
+ vld4.8 {q6_l, q7_l, q8_l, q9_l}, [\src]!
+ vld4.8 {q6_r, q7_r, q8_r, q9_r}, [\src]!
+.else
+ vld4.8 {q6_l, q7_l, q8_l, q9_l}, [\src]!
+ vld4.8 {q6_r, q7_r, q8_r, q9_r}, [\src]
+ sub \src, \src, #32
+ add \src, \src, \count, LSL #2
+.endif
+.endm
+
+.macro alias_8x16 name, qw, set=1
+ alias \name\()8x16, \qw, \set
+ alias \name\()8x8_l, \qw\()_l, \set
+ alias \name\()8x8_r, \qw\()_r, \set
+.endm
+
+.macro alias_src_rgbx set=1
+ alias_8x16 r, q6, \set
+ alias_8x16 g, q7, \set
+ alias_8x16 b, q8, \set
+ alias_8x16 x, q9, \set
+.endm
+
+.macro load_rgbx_16x1 src, count
+ load_8888_16x1 \src, \count
+.endm
+
+
+.macro alias_src_bgrx set=1
+ alias_8x16 b, q6, \set
+ alias_8x16 g, q7, \set
+ alias_8x16 r, q8, \set
+ alias_8x16 x, q9, \set
+.endm
+
+.macro load_bgrx_16x1 src, count
+ load_8888_16x1 \src, \count
+.endm
+
+
+.macro alias_dst_nv12 set=1
+ alias u8x8_tmp, x8x8_l, \set
+ alias v8x8_tmp, x8x8_r, \set
+.endm
+
+.macro store_chroma_nv12_8x1 dst, count
+.if \count == 0
+ vst2.i8 {u8x8_tmp, v8x8_tmp}, [\dst]!
+.else
+ vst2.i8 {u8x8_tmp, v8x8_tmp}, [\dst], \count
+.endif
+.endm
+
+.macro alias_dst_nv21 set=1
+ alias v8x8_tmp, x8x8_l, \set
+ alias u8x8_tmp, x8x8_r, \set
+.endm
+
+.macro store_chroma_nv21_8x1 dst, count
+.if \count == 0
+ vst2.i8 {v8x8_tmp, u8x8_tmp}, [\dst]!
+.else
+ vst2.i8 {v8x8_tmp, u8x8_tmp}, [\dst], \count
+.endif
+.endm
+
+.macro store_y_16x1 dst, count
+.if \count == 0
+ vstmia \dst!, {y8x8_l, y8x8_r}
+.else
+ vstmia \dst, {y8x8_l, y8x8_r}
+ add \dst, \dst, \count
+.endif
+.endm
+
+.macro compute_y_8x1 suf
+ vmull.u8 y16x8_tmp, r8x8_\suf, CO_RY
+ vmlal.u8 y16x8_tmp, g8x8_\suf, CO_GY
+ vmlal.u8 y16x8_tmp, b8x8_\suf, CO_BY
+ vrshrn.i16 y8x8_\suf, y16x8_tmp, #8
+ vadd.u8 y8x8_\suf, y8x8_\suf, BIAS_Y
+.endm
+
+.macro compute_u_8x1 suf
+ vmull.u8 u16x8_\suf, b8x8_\suf, CO_BU
+ vmlsl.u8 u16x8_\suf, r8x8_\suf, CO_NRU
+ vmlsl.u8 u16x8_\suf, g8x8_\suf, CO_NGU
+ vrshrn.i16 u8x8_\suf, u16x8_\suf, #8
+ vadd.u8 u8x8_\suf, u8x8_\suf, BIAS_U
+.endm
+
+.macro compute_v_8x1 suf
+ vmull.u8 v16x8_\suf, r8x8_\suf, CO_RV
+ vmlsl.u8 v16x8_\suf, g8x8_\suf, CO_NGV
+ vmlsl.u8 v16x8_\suf, b8x8_\suf, CO_NBV
+ vrshrn.i16 v8x8_\suf, v16x8_\suf, #8
+ vadd.u8 v8x8_\suf, v8x8_\suf, BIAS_V
+.endm
+
+.macro kernel_420_16x2 rgb_fmt, yuv_fmt, p_src0, p_src1, p_y0, p_y1, p_c, count=0
+ alias_src_\rgb_fmt
+ alias_dst_\yuv_fmt
+
+ load_\rgb_fmt\()_16x1 \p_src0, \count
+
+ compute_y_8x1 l
+    compute_y_8x1 r
+ store_y_16x1 \p_y0, \count
+
+ // downsample step 1
+ vpaddl.u8 r16x8_tmp, r8x16
+ vpaddl.u8 g16x8_tmp, g8x16
+ vpaddl.u8 b16x8_tmp, b8x16
+
+    load_\rgb_fmt\()_16x1 \p_src1, \count
+
+ compute_y_8x1 l
+ compute_y_8x1 r
+ store_y_16x1 \p_y1, \count
+
+ // downsample step 2
+ vpadal.u8 r16x8_tmp, r8x16
+ vpadal.u8 g16x8_tmp, g8x16
+ vpadal.u8 b16x8_tmp, b8x16
+
+ // downsample step 3
+ vrshrn.u16 r8x8_tmp, r16x8_tmp, #2
+ vrshrn.u16 g8x8_tmp, g16x8_tmp, #2
+ vrshrn.u16 b8x8_tmp, b16x8_tmp, #2
+
+ compute_u_8x1 tmp
+ compute_v_8x1 tmp
+
+ store_chroma_\yuv_fmt\()_8x1 \p_c, \count
+
+ alias_dst_\yuv_fmt 0
+ alias_src_\rgb_fmt 0
+.endm
+
+.macro prologue
+ push {r4-r12, lr}
+ vpush {q4-q7}
+.endm
+
+.macro epilogue
+ vpop {q4-q7}
+ pop {r4-r12, pc}
+.endm
+
+.macro load_arg reg, ix
+ ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
+.endm
+
+
+
+/* ()_to_()_neon(const uint8_t *p_src, uint8_t *p_y, uint8_t *p_chroma,
+ * int width, int height,
+ * int y_stride, int c_stride, int src_stride,
+ * uint8_t p_coeff_tbl[9]);
+ */
+.macro alias_loop_420sp set=1
+ alias p_src, r0, \set
+ alias p_src0, p_src, \set
+ alias p_y, r1, \set
+ alias p_y0, p_y, \set
+ alias p_c, r2, \set
+ alias width, r3, \set
+ alias header, width, \set
+
+ alias height, r4, \set
+ alias y_stride, r5, \set
+ alias c_stride, r6, \set
+ alias c_padding, c_stride, \set
+ alias src_stride, r7, \set
+
+ alias p_y0_end, r8, \set
+
+ alias src_padding,r9, \set
+ alias y_padding, r10, \set
+
+ alias p_src1, r11, \set
+ alias p_y1, r12, \set
+
+.endm
+
+.macro loop_420sp s_fmt, d_fmt
+
+function \s_fmt\()_to_\d_fmt\()_neon, export=1
+ prologue
+
+ alias_loop_420sp
+
+ load_arg height, 4
+ load_arg y_stride, 5
+ load_arg c_stride, 6
+ load_arg src_stride, 7
+ load_arg p_coeff_tbl, 8
+
+ load_coeffs
+ load_biases
+
+ sub y_padding, y_stride, width
+ sub c_padding, c_stride, width
+ sub src_padding, src_stride, width, LSL #2
+
+ add p_y0_end, p_y0, width
+ and header, width, #15
+
+ add p_y1, p_y0, y_stride
+ add p_src1, p_src0, src_stride
+
+0:
+ cmp header, #0
+ beq 1f
+
+    kernel_420_16x2 \s_fmt, \d_fmt, p_src0, p_src1, p_y0, p_y1, p_c, header
+
+1:
+ kernel_420_16x2 \s_fmt, \d_fmt, p_src0, p_src1, p_y0, p_y1, p_c
+
+ cmp p_y0, p_y0_end
+    blo 1b
+2:
+ add p_y0, p_y1, y_padding
+ add p_y0_end, p_y1, y_stride
+ add p_c, p_c, c_padding
+ add p_src0, p_src1, src_padding
+
+ add p_y1, p_y0, y_stride
+ add p_src1, p_src0, src_stride
+
+ subs height, height, #2
+
+ bgt 0b
+
+ epilogue
+
+ alias_loop_420sp 0
+
+endfunc
+.endm
+
+ loop_420sp rgbx, nv12
+ loop_420sp rgbx, nv21
+ loop_420sp bgrx, nv12
+ loop_420sp bgrx, nv21
+
diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c
new file mode 100644
index 0000000..94fae9a
--- /dev/null
+++ b/libswscale/arm/swscale_unscaled.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/arm/cpu.h"
+
+extern void rgbx_to_nv12_neon(const uint8_t *p_src, uint8_t *p_y, uint8_t *p_chroma,
+ int width, int height,
+ int y_stride, int c_stride, int src_stride,
+ uint8_t p_coeff_tbl[9]);
+
+static void get_rgb2yuv_table(SwsContext *context, uint8_t dst[9]) {
+ int32_t *src = context->input_rgb2yuv_table;
+
+ dst[RY_IDX] = RSHIFT(src[RY_IDX], RGB2YUV_SHIFT - 8);
+ dst[GY_IDX] = RSHIFT(src[GY_IDX], RGB2YUV_SHIFT - 8);
+ dst[BY_IDX] = RSHIFT(src[BY_IDX], RGB2YUV_SHIFT - 8);
+ dst[RU_IDX] = - RSHIFT(src[RU_IDX], RGB2YUV_SHIFT - 8);
+ dst[GU_IDX] = - RSHIFT(src[GU_IDX], RGB2YUV_SHIFT - 8);
+ dst[BU_IDX] = RSHIFT(src[BU_IDX], RGB2YUV_SHIFT - 8);
+ dst[RV_IDX] = RSHIFT(src[RV_IDX], RGB2YUV_SHIFT - 8);
+ dst[GV_IDX] = - RSHIFT(src[GV_IDX], RGB2YUV_SHIFT - 8);
+ dst[BV_IDX] = - RSHIFT(src[BV_IDX], RGB2YUV_SHIFT - 8);
+}
+
+static int rgbx_to_nv12_neon_wrapper(SwsContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]) {
+ uint8_t table[9];
+
+ int src_pixel_width = srcStride[0] / 4;
+ int y_pixel_width = dstStride[0];
+ int c_pixel_width = dstStride[1] / 2;
+
+ int aligned_width = FFALIGN(context->srcW, 16);
+ int width;
+
+ if (aligned_width <= src_pixel_width
+ && aligned_width <= y_pixel_width
+ && aligned_width <= c_pixel_width) {
+ width = aligned_width;
+ } else {
+ width = context->srcW;
+ }
+
+ get_rgb2yuv_table(context, table);
+
+    av_log(context, AV_LOG_DEBUG, "src(%p) y(%p) chroma(%p)\n",
+           src[0], dst[0], dst[1]);
+    av_log(context, AV_LOG_DEBUG, "srcStride(%d) yStride(%d) cStride(%d)\n",
+           srcStride[0], dstStride[0], dstStride[1]);
+
+ rgbx_to_nv12_neon(src[0] + srcSliceY * srcStride[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY / 2) * dstStride[1],
+ width, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ table);
+
+ return 0;
+}
+
+static void get_unscaled_swscale_neon(SwsContext *c) {
+ if (c->srcFormat == AV_PIX_FMT_RGBA
+ && c->dstFormat == AV_PIX_FMT_NV12
+ && (c->srcW >= 16)) {
+ c->swscale = rgbx_to_nv12_neon_wrapper;
+ }
+}
+
+void ff_get_unscaled_swscale_arm(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_neon(cpu_flags))
+ get_unscaled_swscale_neon(c);
+}
+
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6ad278e..443615d 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -835,6 +835,7 @@ extern const AVClass sws_context_class;
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_bfin(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
+void ff_get_unscaled_swscale_arm(SwsContext *c);
/**
* Return function pointer to fastest main scaler path function depending
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 4181e0d..ccf9980 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1384,6 +1384,9 @@ void ff_get_unscaled_swscale(SwsContext *c)
ff_get_unscaled_swscale_bfin(c);
if (ARCH_PPC)
ff_get_unscaled_swscale_ppc(c);
+ if (ARCH_ARM)
+ ff_get_unscaled_swscale_arm(c);
+
}
/* Convert the palette to the same packed 32-bit format as the palette */
--
1.7.9
More information about the ffmpeg-devel
mailing list