[FFmpeg-devel] [PATCH] avfilter/vf_stereo3d: add x86 SIMD for anaglyph outputs
James Almer
jamrial at gmail.com
Mon Oct 5 18:21:13 CEST 2015
On 10/5/2015 6:49 AM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
> new file mode 100644
> index 0000000..269004b
> --- /dev/null
> +++ b/libavfilter/x86/vf_stereo3d.asm
> @@ -0,0 +1,184 @@
> +;*****************************************************************************
> +;* x86-optimized functions for stereo3d filter
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +%if ARCH_X86_64
> +
> +SECTION_RODATA
> +
> +; rgbrgbrgbrgb
> +; rrrrggggbbbb
> +
> +shuf: db 0, 4, 8, 1,5, 9, 2, 6,10,3, 7,11,-1,-1,-1,-1
> +ex_r: db 0,-1,-1,-1,3,-1,-1,-1,6,-1,-1,-1, 9,-1,-1,-1
> +ex_g: db 1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1,10,-1,-1,-1
> +ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal anaglyph, 11, 13, 16, 3*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b
> + movd m10, [ana_matrix_rq+ 0]
> + movd m11, [ana_matrix_rq+ 4]
> + movd m12, [ana_matrix_rq+ 8]
> + movd m13, [ana_matrix_rq+12]
> + movd m14, [ana_matrix_rq+16]
> + movd m15, [ana_matrix_rq+20]
> + pshufd m10, m10, q0000
> + pshufd m11, m11, q0000
> + pshufd m12, m12, q0000
> + pshufd m13, m13, q0000
> + pshufd m14, m14, q0000
> + pshufd m15, m15, q0000
mova m13, [ana_matrix_rq + 0]
movq m15, [ana_matrix_rq + 16]
pshufd m10, m13, q0000
pshufd m11, m13, q1111
pshufd m12, m13, q2222
pshufd m13, m13, q3333
pshufd m14, m15, q0000
pshufd m15, m15, q1111
This will probably be faster, since it replaces the six movd loads with two loads.
Also, you're not using m7 anywhere, and m13, m14 and m15 remain
unused after the init code. You could keep four of the coefficients
in them instead of using the stack.
More information about the ffmpeg-devel mailing list