[FFmpeg-cvslog] av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()

Pascal Massimino git at videolan.org
Tue Sep 9 17:06:03 CEST 2014


ffmpeg | branch: master | Pascal Massimino <pascal.massimino at gmail.com> | Tue Sep  9 14:38:58 2014 +0200| [e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db] | committer: Michael Niedermayer

av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()

Tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv
MMX:  ~30% faster decoding overall
SSE2: ~40% faster decoding overall

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db
---

 libavfilter/vf_idet.c          |   11 ++++---
 libavfilter/vf_idet.h          |    7 ++--
 libavfilter/x86/vf_idet.asm    |   70 ++++++++++++++++++++++++++++++++++++++--
 libavfilter/x86/vf_idet_init.c |   29 +++++++++++++----
 4 files changed, 103 insertions(+), 14 deletions(-)

diff --git a/libavfilter/vf_idet.c b/libavfilter/vf_idet.c
index 4416228..22ff494 100644
--- a/libavfilter/vf_idet.c
+++ b/libavfilter/vf_idet.c
@@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c,
     return ret;
 }
 
-static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
+int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
 {
     int x;
     int ret=0;
@@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref)
 
     if (!idet->csp)
         idet->csp = av_pix_fmt_desc_get(link->format);
-    if (idet->csp->comp[0].depth_minus1 / 8 == 1)
-        idet->filter_line = (void*)filter_line_c_16bit;
+    if (idet->csp->comp[0].depth_minus1 / 8 == 1){
+        idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit;
+        if (ARCH_X86)
+            ff_idet_init_x86(idet, 1);
+    }
 
     filter(ctx);
 
@@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx)
     idet->filter_line = ff_idet_filter_line_c;
 
     if (ARCH_X86)
-        ff_idet_init_x86(idet);
+        ff_idet_init_x86(idet, 0);
 
     return 0;
 }
diff --git a/libavfilter/vf_idet.h b/libavfilter/vf_idet.h
index 0550690..c5799fb 100644
--- a/libavfilter/vf_idet.h
+++ b/libavfilter/vf_idet.h
@@ -24,6 +24,8 @@
 
 #define HIST_SIZE 4
 
+typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
+
 typedef enum {
     TFF,
     BFF,
@@ -45,14 +47,15 @@ typedef struct {
     AVFrame *cur;
     AVFrame *next;
     AVFrame *prev;
-    int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w);
+    ff_idet_filter_func filter_line;
 
     const AVPixFmtDescriptor *csp;
 } IDETContext;
 
-void ff_idet_init_x86(IDETContext *idet);
+void ff_idet_init_x86(IDETContext *idet, int for_16b);
 
 /* main fall-back for left-over */
 int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
+int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w);
 
 #endif
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
index 14b16c5..4649cae 100644
--- a/libavfilter/x86/vf_idet.asm
+++ b/libavfilter/x86/vf_idet.asm
@@ -25,8 +25,6 @@
 
 SECTION_TEXT
 
-%if ARCH_X86_32
-
 ; Implementation that does 8-bytes at a time using single-word operations.
 %macro IDET_FILTER_LINE 1
 INIT_MMX %1
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
     RET
 %endmacro
 
+%if ARCH_X86_32
 IDET_FILTER_LINE mmxext
 IDET_FILTER_LINE mmx
 %endif
 
+;******************************************************************************
+; 16bit implementation that does 4/8-pixels at a time
+
+%macro PABS_DIFF_WD 3    ; a, b, junk   , output=a
+  psubusw   %3, %2, %1
+  psubusw   %1, %2
+  por       %1, %3
+
+  mova      %2, %1
+  punpcklwd %1, m_zero
+  punpckhwd %2, m_zero
+  paddd     %1, %2
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1   ; %1=increment (4 or 8 words)
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
+    xor       indexq, indexq
+%define m_zero m1
+%define m_sum  m0
+    pxor      m_sum, m_sum
+    pxor      m_zero, m_zero
+
+.loop_16bit:
+    movu      m2, [bq + indexq * 2]  ; B
+    movu      m3, [aq + indexq * 2]  ; A
+    mova      m6, m2
+    psubusw   m5, m2, m3             ; ba
+
+    movu      m4, [cq + indexq * 2]  ; C
+    add       indexq, %1
+    psubusw   m3, m2                 ; ab
+    CMP       indexd, widthd
+
+    psubusw   m6, m4                 ; bc
+    psubusw   m4, m2                 ; cb
+
+    PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
+    PABS_DIFF_WD   m5, m4, m7        ; |ba - cb|
+    paddd          m_sum, m3
+    paddd          m_sum, m5
+    jl        .loop_16bit
+
+    mova      m2, m_sum
+%if mmsize == 16
+    psrldq    m2, 4
+    paddd     m_sum, m2
+    psrldq    m2, 4
+    paddd     m_sum, m2
+    psrldq    m2, 4
+    paddd     m_sum, m2
+%else
+    psrlq     m2, 32
+    paddd     m_sum, m2
+%endif
+    movd      eax, m_sum
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4
+%endif
+
+;******************************************************************************
 ; SSE2 8-bit implementation that does 16-bytes at a time:
+
 INIT_XMM sse2
 cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
     xor       indexq, indexq
diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c
index fb9ad83..1147ca8 100644
--- a/libavfilter/x86/vf_idet_init.c
+++ b/libavfilter/x86/vf_idet_init.c
@@ -23,6 +23,8 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_idet.h"
 
+#if HAVE_YASM
+
 /* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
 #define FUNC_MAIN_DECL(KIND, SPAN)                                        \
 int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b,        \
@@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b,    \
     return sum;                                                           \
 }
 
-#if HAVE_YASM
+
+#define FUNC_MAIN_DECL_16bit(KIND, SPAN)                                       \
+int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b,     \
+                                     const uint16_t *c, int w);                \
+static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+                                         const uint16_t *c, int w) {           \
+    int sum = 0;                                                               \
+    const int left_over = w & (SPAN - 1);                                      \
+    w -= left_over;                                                            \
+    if (w > 0)                                                                 \
+        sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w);                   \
+    if (left_over > 0)                                                         \
+        sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over);    \
+    return sum;                                                                \
+}
 
 FUNC_MAIN_DECL(sse2, 16)
+FUNC_MAIN_DECL_16bit(sse2, 8)
 #if ARCH_X86_32
 FUNC_MAIN_DECL(mmx, 8)
 FUNC_MAIN_DECL(mmxext, 8)
+FUNC_MAIN_DECL_16bit(mmx, 4)
 #endif
 
 #endif
-
-av_cold void ff_idet_init_x86(IDETContext *idet)
+av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b)
 {
 #if HAVE_YASM
     const int cpu_flags = av_get_cpu_flags();
 
 #if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
-        idet->filter_line = idet_filter_line_mmx;
+        idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx;
     }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-        idet->filter_line = idet_filter_line_mmxext;
+        idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext;
     }
 #endif // ARCH_x86_32
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-        idet->filter_line = idet_filter_line_sse2;
+        idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
     }
 #endif // HAVE_YASM
 }



More information about the ffmpeg-cvslog mailing list