[FFmpeg-cvslog] Add V210 SIMD

Kieran Kunhya git at videolan.org
Wed Oct 19 20:35:51 CEST 2011


ffmpeg | branch: master | Kieran Kunhya <kieran at kunhya.com> | Tue Oct 18 19:50:49 2011 +0100| [44d27736fcd3c53ea102847368e609b96e6eda86] | committer: Michael Niedermayer

Add V210 SIMD

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=44d27736fcd3c53ea102847368e609b96e6eda86
---

 libavcodec/v210dec.c       |   89 +++++++++++++++++++++++++++++++++++--------
 libavcodec/v210dec.h       |   34 +++++++++++++++++
 libavcodec/x86/Makefile    |    2 +
 libavcodec/x86/v210-init.c |   48 +++++++++++++++++++++++
 libavcodec/x86/v210.asm    |   85 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 241 insertions(+), 17 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ecd88be..4f40e08 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -22,10 +22,35 @@
  */
 
 #include "avcodec.h"
+#include "v210dec.h"
 #include "libavutil/bswap.h"
+#include "libavutil/x86/timer.h"
+
+#define READ_PIXELS(a, b, c)         \
+    do {                             \
+        val  = av_le2ne32(*src++);   \
+        *a++ =  val & 0x3FF;         \
+        *b++ = (val >> 10) & 0x3FF;  \
+        *c++ = (val >> 20) & 0x3FF;  \
+    } while (0)
+
+static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+{
+    uint32_t val;
+    int i;
+
+    for( i = 0; i < width-5; i += 6 ){
+        READ_PIXELS(u, y, v);
+        READ_PIXELS(y, u, y);
+        READ_PIXELS(v, y, u);
+        READ_PIXELS(y, v, y);
+    }
+}
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
+    V210DecContext *s = avctx->priv_data;
+
     if (avctx->width & 1) {
         av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
         return -1;
@@ -35,18 +60,37 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     avctx->coded_frame         = avcodec_alloc_frame();
 
+    s->unpack_frame            = v210_planar_unpack_c;
+
+    if (HAVE_MMX)
+        v210_x86_init(s);
+
     return 0;
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
                         AVPacket *avpkt)
 {
-    int h, w;
+    V210DecContext *s = avctx->priv_data;
+
+    int h, w, stride, aligned_input;
     AVFrame *pic = avctx->coded_frame;
     const uint8_t *psrc = avpkt->data;
     uint16_t *y, *u, *v;
-    int aligned_width = ((avctx->width + 47) / 48) * 48;
-    int stride = aligned_width * 8 / 3;
+
+    if (s->custom_stride )
+        stride = s->custom_stride;
+    else {
+        int aligned_width = ((avctx->width + 47) / 48) * 48;
+        stride = aligned_width * 8 / 3;
+    }
+
+    aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
+    if (aligned_input != s->aligned_input) {
+        s->aligned_input = aligned_input;
+        if (HAVE_MMX)
+            v210_x86_init(s);
+    }
 
     if (pic->data[0])
         avctx->release_buffer(avctx, pic);
@@ -66,23 +110,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
 
-#define READ_PIXELS(a, b, c)         \
-    do {                             \
-        val  = av_le2ne32(*src++);   \
-        *a++ =  val & 0x3FF;         \
-        *b++ = (val >> 10) & 0x3FF;  \
-        *c++ = (val >> 20) & 0x3FF;  \
-    } while (0)
-
     for (h = 0; h < avctx->height; h++) {
         const uint32_t *src = (const uint32_t*)psrc;
         uint32_t val;
-        for (w = 0; w < avctx->width - 5; w += 6) {
-            READ_PIXELS(u, y, v);
-            READ_PIXELS(y, u, y);
-            READ_PIXELS(v, y, u);
-            READ_PIXELS(y, v, y);
-        }
+
+        w = (avctx->width / 6) * 6;
+        s->unpack_frame(src, y, u, v, w);
+
+        y += w;
+        u += w >> 1;
+        v += w >> 1;
+        src += (w << 1) / 3;
+
         if (w < avctx->width - 1) {
             READ_PIXELS(u, y, v);
 
@@ -120,13 +159,29 @@ static av_cold int decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+#define V210DEC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+static const AVOption v210dec_options[] = {
+    {"custom_stride", "Custom V210 stride", offsetof(V210DecContext, custom_stride), FF_OPT_TYPE_INT,
+     {.dbl = 0}, INT_MIN, INT_MAX, V210DEC_FLAGS},
+    {NULL}
+};
+
+static const AVClass v210dec_class = {
+    "V210 Decoder",
+    av_default_item_name,
+    v210dec_options,
+    LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_v210_decoder = {
     .name           = "v210",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = CODEC_ID_V210,
+    .priv_data_size = sizeof(V210DecContext),
     .init           = decode_init,
     .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = CODEC_CAP_DR1,
     .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+    .priv_class     = &v210dec_class,
 };
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
new file mode 100644
index 0000000..48be729
--- /dev/null
+++ b/libavcodec/v210dec.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210DEC_H
+#define AVCODEC_V210DEC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *av_class;
+    int custom_stride;
+    int aligned_input;
+    void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+} V210DecContext;
+
+void v210_x86_init(V210DecContext *s);
+
+#endif /* AVCODEC_V210DEC_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 3ae63ec..f4783bc 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,6 +39,8 @@ MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER)      += x86/proresdsp-init.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 MMX-OBJS-$(CONFIG_PRORES_DECODER)      += x86/proresdsp-init.o
 MMX-OBJS-$(CONFIG_DWT)                 += x86/snowdsp_mmx.o
+YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
+MMX-OBJS-$(CONFIG_V210_DECODER)        += x86/v210-init.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
 YASM-OBJS-$(CONFIG_VP3_DECODER)        += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP5_DECODER)        += x86/vp3dsp.o
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000..4dd6d6d
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void v210_x86_init(V210DecContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000..344bed0
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,85 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm at u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran at kunhya.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 2
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1_%2, 5, 5
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM
+v210_planar_unpack unaligned, ssse3
+INIT_AVX
+v210_planar_unpack unaligned, avx
+
+INIT_XMM
+v210_planar_unpack aligned, ssse3
+INIT_AVX
+v210_planar_unpack aligned, avx



More information about the ffmpeg-cvslog mailing list