[FFmpeg-devel] [PATCH 2/5] yadif: x86 assembly for 9 to 14-bit samples
James Darnley
james.darnley at gmail.com
Sat Mar 16 21:42:24 CET 2013
These smaller samples do not need to be unpacked to double words
allowing the code to process more pixels every iteration (still 2 in MMX
but 6 in SSE2). It also avoids emulating the missing double word
instructions on older instruction sets.
Like with the previous code for 16-bit samples this has been tested on
an Athlon64 and a Core2Quad.
Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster
Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster
---
libavfilter/x86/Makefile | 2 +-
libavfilter/x86/vf_yadif_init.c | 21 +++-
libavfilter/x86/yadif-10.asm | 284 +++++++++++++++++++++++++++++++++++++++
3 files changed, 305 insertions(+), 2 deletions(-)
create mode 100644 libavfilter/x86/yadif-10.asm
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index f2656d2..cd97347 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -5,4 +5,4 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
-YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 406ec47..b7906bc 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -49,6 +49,16 @@ void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
av_cold void ff_yadif_init_x86(YADIFContext *yadif)
{
int cpu_flags = av_get_cpu_flags();
@@ -56,7 +66,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
: yadif->csp->comp[0].depth_minus1 + 1;
#if HAVE_YASM
- if (bit_depth > 8) {
+ if (bit_depth >= 15) {
#if ARCH_X86_32
if (EXTERNAL_MMXEXT(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
@@ -67,6 +77,15 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+ } else if ( bit_depth >= 9 && bit_depth <= 14) {
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
} else {
#if ARCH_X86_32
if (EXTERNAL_MMXEXT(cpu_flags))
diff --git a/libavfilter/x86/yadif-10.asm b/libavfilter/x86/yadif-10.asm
new file mode 100644
index 0000000..1cf9808
--- /dev/null
+++ b/libavfilter/x86/yadif-10.asm
@@ -0,0 +1,284 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni at gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang at gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro PABS 2
+%if cpuflag(ssse3)
+ pabsw %1, %1
+%else
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+%endif
+%endmacro
+
+%macro PMAXUW 2
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2
+ paddusw %1, %2
+%endif
+%endmacro
+
+%macro CHECK 2
+ movu m2, [curq+t1+%1*2]
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3
+ pand m4, [pw_1]
+ psubusw m5, m4
+%if mmsize == 16
+ psrldq m5, 2
+%else
+ psrlq m5, 16
+%endif
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ mova m4, m2
+%if mmsize == 16
+ psrldq m3, 2
+ psrldq m4, 4
+%else
+ psrlq m3, 16
+ psrlq m4, 32
+%endif
+ paddw m2, m3
+ paddw m2, m4
+%endmacro
+
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtw m3, m2
+ pminsw m0, m2
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+; %macro CHECK2 0
+; paddw m6, [pw_1]
+; psllw m6, 14
+; paddsw m2, m6
+; mova m3, m0
+; pcmpgtw m3, m2
+; pminsw m0, m2
+; pand m5, m3
+; pandn m3, m1
+; por m3, m5
+; mova m1, m3
+; %endmacro
+
+; This version of CHECK2 is required for 14-bit samples. The left-shift trick
+; in the old code is not large enough to correctly select pixels or scores.
+
+%macro CHECK2 0
+ mova m3, m0
+ pcmpgtw m0, m2
+ pand m0, m6
+ mova m6, m0
+ pand m5, m6
+ pand m2, m0
+ pandn m6, m1
+ pandn m0, m3
+ por m6, m5
+ por m0, m2
+ mova m1, m6
+%endmacro
+
+%macro LOAD 2
+ movu m%1, %2
+%endmacro
+
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7
+ LOAD 0, [curq+t1]
+ LOAD 1, [curq+t0]
+ LOAD 2, [%2]
+ LOAD 3, [%3]
+ mova m4, m3
+ paddw m3, m2
+ psraw m3, 1
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubw m2, m4
+ PABS m2, m4
+ LOAD 3, [prevq+t1]
+ LOAD 4, [prevq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddw m3, m4
+ psrlw m2, 1
+ psrlw m3, 1
+ pmaxsw m2, m3
+ LOAD 3, [nextq+t1]
+ LOAD 4, [nextq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddw m3, m4
+ psrlw m3, 1
+ pmaxsw m2, m3
+ mova [rsp+48], m2
+
+ paddw m1, m0
+ paddw m0, m0
+ psubw m0, m1
+ psrlw m1, 1
+ PABS m0, m2
+
+ movu m2, [curq+t1-1*2]
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+%if mmsize == 16
+ mova m3, m2
+ psrldq m3, 4
+%else
+ mova m3, m2
+ psrlq m3, 32
+%endif
+ paddw m0, m2
+ paddw m0, m3
+ psubw m0, [pw_1]
+
+ CHECK -2, 0
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48]
+ cmp DWORD r8m, 2
+ jge .end%1
+ LOAD 2, [%2+t1*2]
+ LOAD 4, [%3+t1*2]
+ LOAD 3, [%2+t0*2]
+ LOAD 5, [%3+t0*2]
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 1
+ psrlw m3, 1
+ mova m4, [rsp+ 0]
+ mova m5, [rsp+16]
+ mova m7, [rsp+32]
+ psubw m2, m4
+ psubw m3, m7
+ mova m0, m5
+ psubw m5, m4
+ psubw m0, m7
+ mova m4, m2
+ pminsw m2, m3
+ pmaxsw m3, m4
+ pmaxsw m2, m5
+ pminsw m3, m5
+ pmaxsw m2, m0
+ pminsw m3, m0
+ pxor m4, m4
+ pmaxsw m6, m3
+ psubw m4, m2
+ pmaxsw m6, m4
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubw m2, m6
+ paddw m3, m6
+ pmaxsw m1, m2
+ pminsw m1, m3
+
+ movu [dstq], m1
+ add dstq, mmsize-4
+ add prevq, mmsize-4
+ add curq, mmsize-4
+ add nextq, mmsize-4
+ sub DWORD r4m, mmsize/2-2
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+ cmp DWORD wm, 0
+ jle .ret
+%if ARCH_X86_32
+ mov r4, r5mp
+ mov r5, r6mp
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m
+ movsxd r6, DWORD r6m
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
--
1.7.9
More information about the ffmpeg-devel
mailing list