[FFmpeg-devel] [PATCH 2/2] vf_colorspace: x86-64 SIMD (SSE2) optimizations.
Kieran Kunhya
kierank at obe.tv
Thu Apr 7 16:05:52 CEST 2016
On Wed, 6 Apr 2016 at 19:10 Ronald S. Bultje <rsbultje at gmail.com> wrote:
> ---
> libavfilter/colorspacedsp.c | 3 +
> libavfilter/colorspacedsp.h | 3 +
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/colorspacedsp.asm | 1115
> ++++++++++++++++++++++++++++++++++
> libavfilter/x86/colorspacedsp_init.c | 119 ++++
> tests/checkasm/Makefile | 1 +
> tests/checkasm/checkasm.c | 3 +
> tests/checkasm/checkasm.h | 1 +
> tests/checkasm/vf_colorspace.c | 314 ++++++++++
> 9 files changed, 1561 insertions(+)
> create mode 100644 libavfilter/x86/colorspacedsp.asm
> create mode 100644 libavfilter/x86/colorspacedsp_init.c
> create mode 100644 tests/checkasm/vf_colorspace.c
>
> diff --git a/libavfilter/colorspacedsp.c b/libavfilter/colorspacedsp.c
> index 51a7c1d..d4c43c3 100644
> --- a/libavfilter/colorspacedsp.c
> +++ b/libavfilter/colorspacedsp.c
> @@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp)
> init_yuv2yuv_fns(2, 12);
>
> dsp->multiply3x3 = multiply3x3_c;
> +
> + if (ARCH_X86)
> + ff_colorspacedsp_x86_init(dsp);
> }
> diff --git a/libavfilter/colorspacedsp.h b/libavfilter/colorspacedsp.h
> index 3571117..4e70c6c 100644
> --- a/libavfilter/colorspacedsp.h
> +++ b/libavfilter/colorspacedsp.h
> @@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext {
>
> void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp);
>
> +/* internal */
> +void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp);
> +
> #endif /* AVFILTER_COLORSPACEDSP_H */
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index ed294e0..4486b79 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -1,5 +1,6 @@
> OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
> OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
> +OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
> OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
> OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
> OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
> @@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) +=
> x86/vf_yadif_init.o
>
> YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
> YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
> +YASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
> YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
> YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
> YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
> diff --git a/libavfilter/x86/colorspacedsp.asm
> b/libavfilter/x86/colorspacedsp.asm
> new file mode 100644
> index 0000000..e536566
> --- /dev/null
> +++ b/libavfilter/x86/colorspacedsp.asm
> @@ -0,0 +1,1115 @@
>
> +;*****************************************************************************
> +;* x86-optimized functions for colorspace filter
> +;*
> +;* Copyright (C) 2016 Ronald S. Bultje <rsbultje at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
>
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_1: times 8 dw 1
> +pw_2: times 8 dw 2
> +pw_4: times 8 dw 4
> +pw_8: times 8 dw 8
> +pw_16: times 8 dw 16
> +pw_64: times 8 dw 64
> +pw_128: times 8 dw 128
> +pw_256: times 8 dw 256
> +pw_512: times 8 dw 512
> +pw_1023: times 8 dw 1023
> +pw_1024: times 8 dw 1024
> +pw_2048: times 8 dw 2048
> +pw_4095: times 8 dw 4095
> +pw_8192: times 8 dw 8192
> +pw_16384: times 8 dw 16384
> +
> +pd_1: times 4 dd 1
> +pd_2: times 4 dd 2
> +pd_128: times 4 dd 128
> +pd_512: times 4 dd 512
> +pd_2048: times 4 dd 2048
> +pd_8192: times 4 dd 8192
> +pd_32768: times 4 dd 32768
> +pd_131072: times 4 dd 131072
>
>
Don't we already have these pw_*/pd_* constants defined somewhere else in the tree?
> +SECTION .text
> +
> +; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t
> yuv_out_stride[3],
> +; uint8_t *yuv_in[3], ptrdiff_t
> yuv_in_stride[3],
> +; int w, int h, const int16_t
> yuv2yuv_coeffs[3][3][8],
> +; const int16_t yuv_offset[2][8])
> +
> +%if ARCH_X86_64
> +%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz),
> log2_chroma_h (vert)
> +
> +%assign %%sh (14 + %1 - %2)
> +%assign %%rnd (1 << (%%sh - 1))
> +%assign %%uvinoff (128 << (%1 - 8))
> +%assign %%uvoutoff (128 << (%2 - 8))
> +%if %3 == 0
> +%assign %%ss 444
> +%elif %4 == 0
> +%assign %%ss 422
> +%else ; %4 == 1
> +%assign %%ss 420
> +%endif ; %3/%4
> +%if %2 != 8
> +%assign %%maxval (1 << %2) - 1
> +%endif ; %2 != 8
> +
> +%assign %%ypsh %%sh - 1
> +%if %%ypsh > 14
> +%assign %%yoffsh %%ypsh - 13
> +%assign %%ypsh 14
> +%else
> +%assign %%yoffsh 1
> +%endif
> +%assign %%yprnd (1 << (%%yoffsh - 1))
> +%assign %%ypmul (1 << %%ypsh)
> +
> +cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
> + yo, yos, yi, yis, w, h, c, yoff, ui,
> vi, uo, vo
> +%if %3 == 1
> + inc wd
> + sar wd, 1
> +%if %4 == 1
> + inc hd
> + sar hd, 1
> +%endif ; %4 == 1
> +%endif ; %3 == 1
> + mov [rsp+3*mmsize+0], wd
> + mov [rsp+3*mmsize+4], hd
> +
> + mova m10, [cq]
> + pxor m11, m11
> + mova m12, [pd_ %+ %%uvoutoff]
> + pslld m12, %%sh
> + paddd m12, [pd_ %+ %%rnd]
> + mova m13, [pw_ %+ %%uvinoff]
> + mova m14, [yoffq+ 0] ; y_off_in
> + mova m15, [yoffq+16] ; y_off_out
> +%if %%yoffsh != 0
> + psllw m15, %%yoffsh
> +%endif
> + paddw m15, [pw_ %+ %%yprnd]
> + punpcklwd m10, m15
> + mova m15, [pw_ %+ %%ypmul]
> + movh m0, [cq+1*16] ; cyu
> + movh m1, [cq+2*16] ; cyv
> + movh m2, [cq+4*16] ; cuu
> + movh m3, [cq+5*16] ; cuv
> + movh m4, [cq+7*16] ; cvu
> + movh m5, [cq+8*16] ; cvv
> + punpcklwd m0, m1
> + punpcklwd m2, m3
> + punpcklwd m4, m5
> + mova [rsp+0*mmsize], m0
> + mova [rsp+1*mmsize], m2
> + mova [rsp+2*mmsize], m4
> +
> + DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x,
> tmp
> +
> + mov uiq, [yiq+gprsize*1]
> + mov viq, [yiq+gprsize*2]
> + mov yiq, [yiq+gprsize*0]
> + mov uoq, [yoq+gprsize*1]
> + mov voq, [yoq+gprsize*2]
> + mov yoq, [yoq+gprsize*0]
> + mov uisq, [yisq+gprsize*1]
> + mov visq, [yisq+gprsize*2]
> + mov yisq, [yisq+gprsize*0]
> + mov uosq, [yosq+gprsize*1]
> + mov vosq, [yosq+gprsize*2]
> + mov yosq, [yosq+gprsize*0]
> +
> +.loop_v:
> + xor xq, xq
> +
> +.loop_h:
> +%if %4 == 1
> + lea tmpq, [yiq+yisq]
> +%endif ; %4 == 1
> +%if %1 == 8
> + movu m0, [yiq+xq*(1<<%3)] ; y00/01
> +%if %4 == 1
> + movu m2, [tmpq+xq*2] ; y10/11
> +%endif ; %4 == 1
> +%if %3 == 1
> + movh m4, [uiq+xq] ; u
> + movh m5, [viq+xq] ; v
> +%else ; %3 != 1
> + movu m4, [uiq+xq] ; u
> + movu m5, [viq+xq] ; v
> +%endif ; %3 ==/!= 1
> + punpckhbw m1, m0, m11
> + punpcklbw m0, m11
> +%if %4 == 1
> + punpckhbw m3, m2, m11
> + punpcklbw m2, m11
> +%endif ; %4 == 1
> +%if %3 == 0
> + punpckhbw m2, m4, m11
> + punpckhbw m3, m5, m11
> +%endif ; %3 == 0
> + punpcklbw m4, m11
> + punpcklbw m5, m11
> +%else ; %1 != 8
> + movu m0, [yiq+xq*(2<<%3)] ; y00/01
> + movu m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
> +%if %4 == 1
> + movu m2, [tmpq+xq*4] ; y10/11
> + movu m3, [tmpq+xq*4+mmsize] ; y10/11
> +%endif ; %4 == 1
> + movu m4, [uiq+xq*2] ; u
> + movu m5, [viq+xq*2] ; v
> +%if %3 == 0
> + movu m2, [uiq+xq*2+mmsize]
> + movu m3, [viq+xq*2+mmsize]
> +%endif ; %3 == 0
> +%endif ; %1 ==/!= 8
> + psubw m0, m14
> + psubw m1, m14
> +%if %4 == 1
> + psubw m2, m14
> + psubw m3, m14
> +%endif ; %4 == 1
> + psubw m4, m13
> + psubw m5, m13
> +%if %3 == 0
> + psubw m2, m13
> + psubw m3, m13
> +%endif ; %3 == 0
> +
> + SBUTTERFLY wd, 4, 5, 6
> + pmaddwd m6, m4, [rsp+1*mmsize]
> + pmaddwd m7, m5, [rsp+1*mmsize]
> +%if %3 == 0
> + SBUTTERFLY wd, 2, 3, 8
> + pmaddwd m8, m2, [rsp+1*mmsize]
> + pmaddwd m9, m3, [rsp+1*mmsize]
> +%else ; %3 != 0
> + pmaddwd m8, m4, [rsp+2*mmsize]
> + pmaddwd m9, m5, [rsp+2*mmsize]
> +%endif
> + paddd m6, m12
> + paddd m7, m12
> + paddd m8, m12
> + paddd m9, m12
> + psrad m6, %%sh
> + psrad m7, %%sh
> + psrad m8, %%sh
> + psrad m9, %%sh
> + packssdw m6, m7
> + packssdw m8, m9
> +%if %2 == 8
> + packuswb m6, m8
> +%if %3 == 0
> + movu [uoq+xq], m6
> +%else ; %3 != 0
> + movh [uoq+xq], m6
> + movhps [voq+xq], m6
> +%endif ; %3 ==/!= 0
> +%else ; %2 != 8
> + pmaxsw m6, m11
> + pmaxsw m8, m11
> + pminsw m6, [pw_ %+ %%maxval]
> + pminsw m8, [pw_ %+ %%maxval]
>
This pmaxsw/pminsw clamp could use the CLIPW macro.
> + movu [uoq+xq*2], m6
> +%if %3 == 0
> + movu [uoq+xq*2+mmsize], m8
> +%else ; %3 != 0
> + movu [voq+xq*2], m8
> +%endif ; %3 ==/!= 0
> +%endif ; %2 ==/!= 8
> +
> +%if %3 == 0
> + pmaddwd m6, m4, [rsp+2*mmsize]
> + pmaddwd m7, m5, [rsp+2*mmsize]
> + pmaddwd m8, m2, [rsp+2*mmsize]
> + pmaddwd m9, m3, [rsp+2*mmsize]
> + paddd m6, m12
> + paddd m7, m12
> + paddd m8, m12
> + paddd m9, m12
> + psrad m6, %%sh
> + psrad m7, %%sh
> + psrad m8, %%sh
> + psrad m9, %%sh
> + packssdw m6, m7
> + packssdw m8, m9
> +%if %2 == 8
> + packuswb m6, m8
> + movu [voq+xq], m6
> +%else ; %2 != 8
> + pmaxsw m6, m11
> + pmaxsw m8, m11
> + pminsw m6, [pw_ %+ %%maxval]
> + pminsw m8, [pw_ %+ %%maxval]
>
Same here: use CLIPW (this applies in a few other places as well).
> + movu [voq+xq*2], m6
> + movu [voq+xq*2+mmsize], m8
> +%endif ; %2 ==/!= 8
> +%endif ; %3 == 0
> +
> + pmaddwd m4, [rsp+0*mmsize]
> + pmaddwd m5, [rsp+0*mmsize] ; uv_val
> +%if %3 == 0
> + pmaddwd m2, [rsp+0*mmsize]
> + pmaddwd m3, [rsp+0*mmsize]
> +%endif ; %3 == 0
> +
> + ; unpack y pixels with m15 (shifted round + offset), then multiply
> + ; by m10, add uv pixels, and we're done!
> +%if %3 == 1
> + punpckhdq m8, m4, m4
> + punpckldq m4, m4
> + punpckhdq m9, m5, m5
> + punpckldq m5, m5
> +%else ; %3 != 1
> + SWAP 8, 5, 2
> + SWAP 3, 9
> +%endif ; %3 ==/!= 1
> +%if %4 == 1
> + punpckhwd m6, m2, m15
> + punpcklwd m2, m15
> + punpckhwd m7, m3, m15
> + punpcklwd m3, m15
> + pmaddwd m2, m10
> + pmaddwd m6, m10
> + pmaddwd m3, m10
> + pmaddwd m7, m10
> + paddd m2, m4
> + paddd m6, m8
> + paddd m3, m5
> + paddd m7, m9
> + psrad m2, %%sh
> + psrad m6, %%sh
> + psrad m3, %%sh
> + psrad m7, %%sh
> + packssdw m2, m6
> + packssdw m3, m7
> +
> + lea tmpq, [yoq+yosq]
> +%if %2 == 8
> + packuswb m2, m3
> + movu [tmpq+xq*2], m2
> +%else ; %2 != 8
> + pmaxsw m2, m11
> + pmaxsw m3, m11
> + pminsw m2, [pw_ %+ %%maxval]
> + pminsw m3, [pw_ %+ %%maxval]
> + movu [tmpq+xq*4], m2
> + movu [tmpq+xq*4+mmsize], m3
> +%endif ; %2 ==/!= 8
> +%endif ; %4 == 1
> +
> + punpckhwd m6, m0, m15
> + punpcklwd m0, m15
> + punpckhwd m7, m1, m15
> + punpcklwd m1, m15
> + pmaddwd m0, m10
> + pmaddwd m6, m10
> + pmaddwd m1, m10
> + pmaddwd m7, m10
> + paddd m0, m4
> + paddd m6, m8
> + paddd m1, m5
> + paddd m7, m9
> + psrad m0, %%sh
> + psrad m6, %%sh
> + psrad m1, %%sh
> + psrad m7, %%sh
> + packssdw m0, m6
> + packssdw m1, m7
> +
> +%if %2 == 8
> + packuswb m0, m1
> + movu [yoq+xq*(1<<%3)], m0
> +%else ; %2 != 8
> + pmaxsw m0, m11
> + pmaxsw m1, m11
> + pminsw m0, [pw_ %+ %%maxval]
> + pminsw m1, [pw_ %+ %%maxval]
> + movu [yoq+xq*(2<<%3)], m0
> + movu [yoq+xq*(2<<%3)+mmsize], m1
> +%endif ; %2 ==/!= 8
> +
> + add xq, mmsize >> %3
> + cmp xd, dword [rsp+3*mmsize+0]
> + jl .loop_h
> +
> +%if %4 == 1
> + lea yiq, [yiq+yisq*2]
> + lea yoq, [yoq+yosq*2]
> +%else ; %4 != 1
> + add yiq, yisq
> + add yoq, yosq
> +%endif ; %4 ==/!= 1
> + add uiq, uisq
> + add viq, visq
> + add uoq, uosq
> + add voq, vosq
> + dec dword [rsp+3*mmsize+4]
> + jg .loop_v
> +
> + RET
> +%endmacro
> +
> +%macro YUV2YUV_FNS 2 ; ss_w, ss_h
> +YUV2YUV_FN 8, 8, %1, %2
> +YUV2YUV_FN 10, 8, %1, %2
> +YUV2YUV_FN 12, 8, %1, %2
> +YUV2YUV_FN 8, 10, %1, %2
> +YUV2YUV_FN 10, 10, %1, %2
> +YUV2YUV_FN 12, 10, %1, %2
> +YUV2YUV_FN 8, 12, %1, %2
> +YUV2YUV_FN 10, 12, %1, %2
> +YUV2YUV_FN 12, 12, %1, %2
> +%endmacro
> +
> +INIT_XMM sse2
> +YUV2YUV_FNS 0, 0
> +YUV2YUV_FNS 1, 0
> +YUV2YUV_FNS 1, 1
> +
> +; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
> +; uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
> +; int w, int h, const int16_t
> yuv2rgb_coeffs[3][3][8],
> +; const int16_t yuv_offset[8])
> +%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
> +%assign %%sh (%1 - 1)
> +%assign %%rnd (1 << (%%sh - 1))
> +%assign %%uvoff (1 << (%1 - 1))
> +%if %2 == 0
> +%assign %%ss 444
> +%elif %3 == 0
> +%assign %%ss 422
> +%else ; %3 == 1
> +%assign %%ss 420
> +%endif ; %2/%3
> +
> +cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
> + rgb, rgbs, yuv, yuvs, ww, h, c, yoff
> +%if %2 == 1
> + inc wwd
> + sar wwd, 1
> +%endif ; %2 == 1
> +%if %3 == 1
> + inc hd
> + sar hd, 1
> +%endif ; %3 == 1
> + pxor m11, m11
> + mova m15, [yoffq] ; yoff
> + movh m14, [cq+ 0] ; cy
> + movh m10, [cq+ 32] ; crv
> + movh m13, [cq+112] ; cbu
> + movh m12, [cq+ 64] ; cgu
> + movh m9, [cq+ 80] ; cgv
> + punpcklwd m14, [pw_ %+ %%rnd] ; cy, rnd
> + punpcklwd m13, m11 ; cbu, 0
> + punpcklwd m11, m10 ; 0, crv
> + punpcklwd m12, m9 ; cgu, cgv
> + mova [rsp+0*mmsize], m11
> + mova [rsp+1*mmsize], m12
> + mova [rsp+2*mmsize], m13
> + mova [rsp+3*mmsize], m14
> + pxor m14, m14
> +
> + DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
> +
> + mov gq, [rq+1*gprsize]
> + mov bq, [rq+2*gprsize]
> + mov rq, [rq+0*gprsize]
> + mov uq, [yq+1*gprsize]
> + mov vq, [yq+2*gprsize]
> + mov yq, [yq+0*gprsize]
> + mov usq, [ysq+1*gprsize]
> + mov vsq, [ysq+2*gprsize]
> + mov ysq, [ysq+0*gprsize]
> +
> +.loop_v:
> + xor xq, xq
> +
> +.loop_h:
> +%if %3 == 1
> + lea tmpq, [yq+ysq]
> +%endif ; %3 == 1
> +%if %1 == 8
> + movu m0, [yq+xq*(1<<%2)]
> +%if %3 == 1
> + movu m2, [tmpq+xq*2]
> +%endif ; %3 == 1
> +%if %2 == 1
> + movh m4, [uq+xq]
> + movh m5, [vq+xq]
> +%else ; %2 != 1
> + movu m4, [uq+xq]
> + movu m5, [vq+xq]
> +%endif ; %2 ==/!= 1
> + punpckhbw m1, m0, m14
> + punpcklbw m0, m14
> +%if %3 == 1
> + punpckhbw m3, m2, m14
> + punpcklbw m2, m14
> +%endif ; %3 == 1
> +%if %2 == 0
> + punpckhbw m2, m4, m14
> + punpckhbw m3, m5, m14
> +%endif ; %2 == 0
> + punpcklbw m4, m14
> + punpcklbw m5, m14
> +%else ; %1 != 8
> + movu m0, [yq+xq*(2<<%2)]
> + movu m1, [yq+xq*(2<<%2)+mmsize]
> +%if %3 == 1
> + movu m2, [tmpq+xq*4]
> + movu m3, [tmpq+xq*4+mmsize]
> +%endif ; %3 == 1
> + movu m4, [uq+xq*2]
> + movu m5, [vq+xq*2]
> +%if %2 == 0
> + movu m2, [uq+xq*2+mmsize]
> + movu m3, [vq+xq*2+mmsize]
> +%endif ; %2 == 0
> +%endif ; %1 ==/!= 8
> + psubw m0, m15
> + psubw m1, m15
> +%if %3 == 1
> + psubw m2, m15
> + psubw m3, m15
> +%endif ; %3 == 1
> + psubw m4, [pw_ %+ %%uvoff]
> + psubw m5, [pw_ %+ %%uvoff]
> + SBUTTERFLY wd, 4, 5, 6
> +%if %2 == 0
> + psubw m2, [pw_ %+ %%uvoff]
> + psubw m3, [pw_ %+ %%uvoff]
> + SBUTTERFLY wd, 2, 3, 6
> +%endif ; %2 == 0
> +
> + ; calculate y+rnd full-resolution [0-3,6-9]
> + punpckhwd m6, m0, [pw_1] ; y, 1
> + punpcklwd m0, [pw_1] ; y, 1
> + punpckhwd m7, m1, [pw_1] ; y, 1
> + punpcklwd m1, [pw_1] ; y, 1
> + pmaddwd m0, [rsp+3*mmsize]
> + pmaddwd m6, [rsp+3*mmsize]
> + pmaddwd m1, [rsp+3*mmsize]
> + pmaddwd m7, [rsp+3*mmsize]
> +%if %3 == 1
> + punpckhwd m8, m2, [pw_1] ; y, 1
> + punpcklwd m2, [pw_1] ; y, 1
> + punpckhwd m9, m3, [pw_1] ; y, 1
> + punpcklwd m3, [pw_1] ; y, 1
> + pmaddwd m2, [rsp+3*mmsize]
> + pmaddwd m8, [rsp+3*mmsize]
> + pmaddwd m3, [rsp+3*mmsize]
> + pmaddwd m9, [rsp+3*mmsize]
> + mova [rsp+4*mmsize], m2
> + mova [rsp+5*mmsize], m8
> + mova [rsp+6*mmsize], m3
> + mova [rsp+7*mmsize], m9
> +%endif ; %3 == 1
> +
> + ; calculate r offsets (un-subsampled, then duplicate)
> + pmaddwd m10, m4, [rsp+0*mmsize]
> +%if %2 == 1
> + pmaddwd m12, m5, [rsp+0*mmsize]
> + punpckhdq m11, m10, m10
> + punpckldq m10, m10
> + punpckhdq m13, m12, m12
> + punpckldq m12, m12
> +%else ; %2 != 1
> + pmaddwd m11, m5, [rsp+0*mmsize]
> + pmaddwd m12, m2, [rsp+0*mmsize]
> + pmaddwd m13, m3, [rsp+0*mmsize]
> +%endif ; %2 ==/!= 1
> +%if %3 == 1
> + paddd m2, m10, [rsp+4*mmsize]
> + paddd m3, m11, [rsp+5*mmsize]
> + paddd m8, m12, [rsp+6*mmsize]
> + paddd m9, m13, [rsp+7*mmsize]
> +%endif
> + paddd m10, m0
> + paddd m11, m6
> + paddd m12, m1
> + paddd m13, m7
> +%if %3 == 1
> + psrad m2, %%sh
> + psrad m3, %%sh
> + psrad m8, %%sh
> + psrad m9, %%sh
> +%endif ; %3 == 1
> + psrad m10, %%sh
> + psrad m11, %%sh
> + psrad m12, %%sh
> + psrad m13, %%sh
> +%if %3 == 1
> + lea tmpq, [rq+rgbsq*2]
> + packssdw m2, m3
> + packssdw m8, m9
> + mova [tmpq+xq*4], m2
> + mova [tmpq+xq*4+mmsize], m8
> +%endif ; %3 == 1
> + packssdw m10, m11
> + packssdw m12, m13
> + mova [rq+xq*(2 << %2)], m10
> + mova [rq+xq*(2 << %2)+mmsize], m12
> +
> + ; calculate g offsets (un-subsampled, then duplicate)
> + pmaddwd m10, m4, [rsp+1*mmsize]
> +%if %2 == 1
> + pmaddwd m12, m5, [rsp+1*mmsize]
> + punpckhdq m11, m10, m10
> + punpckldq m10, m10
> + punpckhdq m13, m12, m12
> + punpckldq m12, m12
> +%else ; %2 != 1
> + pmaddwd m11, m5, [rsp+1*mmsize]
> + pmaddwd m12, m2, [rsp+1*mmsize]
> + pmaddwd m13, m3, [rsp+1*mmsize]
> +%endif ; %2 ==/!= 1
> +%if %3 == 1
> + paddd m2, m10, [rsp+4*mmsize]
> + paddd m3, m11, [rsp+5*mmsize]
> + paddd m8, m12, [rsp+6*mmsize]
> + paddd m9, m13, [rsp+7*mmsize]
> +%endif ; %3 == 1
> + paddd m10, m0
> + paddd m11, m6
> + paddd m12, m1
> + paddd m13, m7
> +%if %3 == 1
> + psrad m2, %%sh
> + psrad m3, %%sh
> + psrad m8, %%sh
> + psrad m9, %%sh
> +%endif ; %3 == 1
> + psrad m10, %%sh
> + psrad m11, %%sh
> + psrad m12, %%sh
> + psrad m13, %%sh
> +%if %3 == 1
> + lea tmpq, [gq+rgbsq*2]
> + packssdw m2, m3
> + packssdw m8, m9
> + mova [tmpq+xq*4], m2
> + mova [tmpq+xq*4+mmsize], m8
> +%endif ; %3 == 1
> + packssdw m10, m11
> + packssdw m12, m13
> + mova [gq+xq*(2 << %2)], m10
> + mova [gq+xq*(2 << %2)+mmsize], m12
> +
> + ; calculate b offsets (un-subsampled, then duplicate)
> + pmaddwd m4, [rsp+2*mmsize]
> + pmaddwd m5, [rsp+2*mmsize]
> +%if %2 == 1
> + punpckhdq m2, m4, m4
> + punpckldq m4, m4
> + punpckhdq m3, m5, m5
> + punpckldq m5, m5
> +%else ; %2 != 1
> + pmaddwd m2, [rsp+2*mmsize]
> + pmaddwd m3, [rsp+2*mmsize]
> + SWAP 2, 5
> +%endif ; %2 ==/!= 1
> + paddd m0, m4
> + paddd m6, m2
> + paddd m1, m5
> + paddd m7, m3
> +%if %3 == 1
> + paddd m4, [rsp+4*mmsize]
> + paddd m2, [rsp+5*mmsize]
> + paddd m5, [rsp+6*mmsize]
> + paddd m3, [rsp+7*mmsize]
> +%endif ; %3 == 1
> + psrad m0, %%sh
> + psrad m6, %%sh
> + psrad m1, %%sh
> + psrad m7, %%sh
> +%if %3 == 1
> + psrad m4, %%sh
> + psrad m2, %%sh
> + psrad m5, %%sh
> + psrad m3, %%sh
> +%endif ; %3 == 1
> + packssdw m0, m6
> + packssdw m1, m7
> + movu [bq+xq*(2 << %2)], m0
> + movu [bq+xq*(2 << %2)+mmsize], m1
> +%if %3 == 1
> + lea tmpq, [bq+rgbsq*2]
> + packssdw m4, m2
> + packssdw m5, m3
> + movu [tmpq+xq*4], m4
> + movu [tmpq+xq*4+mmsize], m5
> +%endif ; %3 == 1
> +
> + add xd, mmsize >> %2
> + cmp xd, wwd
> + jl .loop_h
> +
> + lea rq, [rq+rgbsq*(2 << %3)]
> + lea gq, [gq+rgbsq*(2 << %3)]
> + lea bq, [bq+rgbsq*(2 << %3)]
> +%if %3 == 1
> + lea yq, [yq+ysq*2]
> +%else ; %3 != 0
> + add yq, ysq
> +%endif ; %3 ==/!= 1
> + add uq, usq
> + add vq, vsq
> + dec hd
> + jg .loop_v
> +
> + RET
> +%endmacro
> +
> +%macro YUV2RGB_FNS 2
> +YUV2RGB_FN 8, %1, %2
> +YUV2RGB_FN 10, %1, %2
> +YUV2RGB_FN 12, %1, %2
> +%endmacro
> +
> +INIT_XMM sse2
> +YUV2RGB_FNS 0, 0
> +YUV2RGB_FNS 1, 0
> +YUV2RGB_FNS 1, 1
> +
> +%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
> +%assign %%sh 29 - %1
> +%assign %%rnd (1 << (%%sh - 15))
> +%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
> +%if %1 != 8
> +%assign %%maxval ((1 << %1) - 1)
> +%endif ; %1 != 8
> +%if %2 == 0
> +%assign %%ss 444
> +%elif %3 == 0
> +%assign %%ss 422
> +%else ; %3 == 1
> +%assign %%ss 420
> +%endif ; %2/%3
> +
> +cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
> + yuv, yuvs, rgb, rgbs, ww, h, c, off
> +%if %2 == 1
> + inc wwd
> + sar wwd, 1
> +%endif ; %2 == 1
> +%if %3 == 1
> + inc hd
> + sar hd, 1
> +%endif ; %3 == 1
> +
> + ; prepare coeffs
> + movh m8, [offq]
> + movh m9, [pw_ %+ %%uvrnd]
> + psllw m8, %%sh - 14
> + paddw m9, [pw_ %+ %%rnd]
> + paddw m8, [pw_ %+ %%rnd]
> + movh m0, [cq+ 0]
> + movh m1, [cq+ 16]
> + movh m2, [cq+ 32]
> + movh m3, [cq+ 48]
> + movh m4, [cq+ 64]
> + movh m5, [cq+ 80]
> + movh m6, [cq+112]
> + movh m7, [cq+128]
> + punpcklwd m0, m1
> + punpcklwd m2, m8
> + punpcklwd m3, m4
> + punpcklwd m4, m5, m9
> + punpcklwd m5, m6
> + punpcklwd m7, m9
> +
> + mova [rsp+0*mmsize], m0 ; cry, cgy
> + mova [rsp+1*mmsize], m2 ; cby, off + rnd
> + mova [rsp+2*mmsize], m3 ; cru, cgu
> + mova [rsp+3*mmsize], m4 ; cburv, uvoff + rnd
> + mova [rsp+4*mmsize], m5 ; cburv, cgv
> + mova [rsp+5*mmsize], m7 ; cbv, uvoff + rnd
> +
> +
> + DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
> + mov gq, [rq+gprsize*1]
> + mov bq, [rq+gprsize*2]
> + mov rq, [rq+gprsize*0]
> + mov uq, [yq+gprsize*1]
> + mov vq, [yq+gprsize*2]
> + mov yq, [yq+gprsize*0]
> + mov usq, [ysq+gprsize*1]
> + mov vsq, [ysq+gprsize*2]
> + mov ysq, [ysq+gprsize*0]
> +
> + pxor m15, m15
> +.loop_v:
> + xor xd, xd
> +
> +.loop_h:
> + ; top line y
> + mova m0, [rq+xq*(2<<%2)]
> + mova m3, [rq+xq*(2<<%2)+mmsize]
> + mova m1, [gq+xq*(2<<%2)]
> + mova m4, [gq+xq*(2<<%2)+mmsize]
> + mova m2, [bq+xq*(2<<%2)]
> + mova m5, [bq+xq*(2<<%2)+mmsize]
> +
> + punpcklwd m6, m0, m1
> + punpckhwd m7, m0, m1
> + punpcklwd m8, m3, m4
> + punpckhwd m9, m3, m4
> + punpcklwd m10, m2, [pw_16384]
> + punpckhwd m11, m2, [pw_16384]
> + punpcklwd m12, m5, [pw_16384]
> + punpckhwd m13, m5, [pw_16384]
> +
> + pmaddwd m6, [rsp+0*mmsize]
> + pmaddwd m7, [rsp+0*mmsize]
> + pmaddwd m8, [rsp+0*mmsize]
> + pmaddwd m9, [rsp+0*mmsize]
> + pmaddwd m10, [rsp+1*mmsize]
> + pmaddwd m11, [rsp+1*mmsize]
> + pmaddwd m12, [rsp+1*mmsize]
> + pmaddwd m13, [rsp+1*mmsize]
> + paddd m6, m10
> + paddd m7, m11
> + paddd m8, m12
> + paddd m9, m13
> + psrad m6, %%sh
> + psrad m7, %%sh
> + psrad m8, %%sh
> + psrad m9, %%sh
> + packssdw m6, m7
> + packssdw m8, m9
> +%if %1 == 8
> + packuswb m6, m8
> + movu [yq+xq*(1<<%2)], m6
> +%else
> + pminsw m6, [pw_ %+ %%maxval]
> + pminsw m8, [pw_ %+ %%maxval]
> + pmaxsw m6, m15
> + pmaxsw m8, m15
> + movu [yq+xq*(2<<%2)], m6
> + movu [yq+xq*(2<<%2)+mmsize], m8
> +%endif
> +
> +%if %2 == 1
> + ; subsampling cached data
> + pmaddwd m0, [pw_1]
> + pmaddwd m1, [pw_1]
> + pmaddwd m2, [pw_1]
> + pmaddwd m3, [pw_1]
> + pmaddwd m4, [pw_1]
> + pmaddwd m5, [pw_1]
> +
> +%if %3 == 1
> + ; bottom line y, r/g portion only
> + lea tmpq, [rgbsq+xq*2]
> + mova m6, [rq+tmpq*2]
> + mova m9, [rq+tmpq*2+mmsize]
> + mova m7, [gq+tmpq*2]
> + mova m10, [gq+tmpq*2+mmsize]
> + mova m8, [bq+tmpq*2]
> + mova m11, [bq+tmpq*2+mmsize]
> +
> + punpcklwd m12, m6, m7
> + punpckhwd m13, m6, m7
> + punpcklwd m14, m9, m10
> + punpckhwd m15, m9, m10
> +
> + ; release two more registers
> + pmaddwd m6, [pw_1]
> + pmaddwd m7, [pw_1]
> + pmaddwd m9, [pw_1]
> + pmaddwd m10, [pw_1]
> + paddd m0, m6
> + paddd m3, m9
> + paddd m1, m7
> + paddd m4, m10
> +
> + ; bottom line y, b/rnd portion only
> + punpcklwd m6, m8, [pw_16384]
> + punpckhwd m7, m8, [pw_16384]
> + punpcklwd m9, m11, [pw_16384]
> + punpckhwd m10, m11, [pw_16384]
> +
> + pmaddwd m12, [rsp+0*mmsize]
> + pmaddwd m13, [rsp+0*mmsize]
> + pmaddwd m14, [rsp+0*mmsize]
> + pmaddwd m15, [rsp+0*mmsize]
> + pmaddwd m6, [rsp+1*mmsize]
> + pmaddwd m7, [rsp+1*mmsize]
> + pmaddwd m9, [rsp+1*mmsize]
> + pmaddwd m10, [rsp+1*mmsize]
> + paddd m12, m6
> + paddd m13, m7
> + paddd m14, m9
> + paddd m15, m10
> + psrad m12, %%sh
> + psrad m13, %%sh
> + psrad m14, %%sh
> + psrad m15, %%sh
> + packssdw m12, m13
> + packssdw m14, m15
> + lea tmpq, [yq+ysq]
> +%if %1 == 8
> + packuswb m12, m14
> + movu [tmpq+xq*2], m12
> +%else
> + pxor m15, m15
> + pminsw m12, [pw_ %+ %%maxval]
> + pminsw m14, [pw_ %+ %%maxval]
> + pmaxsw m12, m15
> + pmaxsw m14, m15
> + movu [tmpq+xq*4], m12
> + movu [tmpq+xq*4+mmsize], m14
> +%endif
> +
> + ; complete subsampling of r/g/b pixels for u/v
> + pmaddwd m8, [pw_1]
> + pmaddwd m11, [pw_1]
> + paddd m2, m8
> + paddd m5, m11
> + paddd m0, [pd_2]
> + paddd m1, [pd_2]
> + paddd m2, [pd_2]
> + paddd m3, [pd_2]
> + paddd m4, [pd_2]
> + paddd m5, [pd_2]
> + psrad m0, 2
> + psrad m1, 2
> + psrad m2, 2
> + psrad m3, 2
> + psrad m4, 2
> + psrad m5, 2
> +%else ; %3 != 1
> + paddd m0, [pd_1]
> + paddd m1, [pd_1]
> + paddd m2, [pd_1]
> + paddd m3, [pd_1]
> + paddd m4, [pd_1]
> + paddd m5, [pd_1]
> + psrad m0, 1
> + psrad m1, 1
> + psrad m2, 1
> + psrad m3, 1
> + psrad m4, 1
> + psrad m5, 1
> +%endif ; %3 ==/!= 1
> + packssdw m0, m3
> + packssdw m1, m4
> + packssdw m2, m5
> +%endif ; %2 == 1
> +
> + ; convert u/v pixels
> + SBUTTERFLY wd, 0, 1, 6
> + punpckhwd m6, m2, [pw_16384]
> + punpcklwd m2, [pw_16384]
> +
> + pmaddwd m7, m0, [rsp+2*mmsize]
> + pmaddwd m8, m1, [rsp+2*mmsize]
> + pmaddwd m9, m2, [rsp+3*mmsize]
> + pmaddwd m10, m6, [rsp+3*mmsize]
> + pmaddwd m0, [rsp+4*mmsize]
> + pmaddwd m1, [rsp+4*mmsize]
> + pmaddwd m2, [rsp+5*mmsize]
> + pmaddwd m6, [rsp+5*mmsize]
> + paddd m7, m9
> + paddd m8, m10
> + paddd m0, m2
> + paddd m1, m6
> + psrad m7, %%sh
> + psrad m8, %%sh
> + psrad m0, %%sh
> + psrad m1, %%sh
> + packssdw m7, m8
> + packssdw m0, m1
> +%if %2 == 1
> +%if %1 == 8
> + packuswb m7, m0
> + movh [uq+xq], m7
> + movhps [vq+xq], m7
> +%else
> + pminsw m7, [pw_ %+ %%maxval]
> + pminsw m0, [pw_ %+ %%maxval]
> + pmaxsw m7, m15
> + pmaxsw m0, m15
> + movu [uq+xq*2], m7
> + movu [vq+xq*2], m0
> +%endif
> +%else ; %2 != 1
> + ; second set of u/v pixels
> + SBUTTERFLY wd, 3, 4, 6
> + punpckhwd m6, m5, [pw_16384]
> + punpcklwd m5, [pw_16384]
> +
> + pmaddwd m8, m3, [rsp+2*mmsize]
> + pmaddwd m9, m4, [rsp+2*mmsize]
> + pmaddwd m10, m5, [rsp+3*mmsize]
> + pmaddwd m11, m6, [rsp+3*mmsize]
> + pmaddwd m3, [rsp+4*mmsize]
> + pmaddwd m4, [rsp+4*mmsize]
> + pmaddwd m5, [rsp+5*mmsize]
> + pmaddwd m6, [rsp+5*mmsize]
> + paddd m8, m10
> + paddd m9, m11
> + paddd m3, m5
> + paddd m4, m6
> + psrad m8, %%sh
> + psrad m9, %%sh
> + psrad m3, %%sh
> + psrad m4, %%sh
> + packssdw m8, m9
> + packssdw m3, m4
> +
> +%if %1 == 8
> + packuswb m7, m8
> + packuswb m0, m3
> + movu [uq+xq], m7
> + movu [vq+xq], m0
> +%else
> + pminsw m7, [pw_ %+ %%maxval]
> + pminsw m0, [pw_ %+ %%maxval]
> + pminsw m8, [pw_ %+ %%maxval]
> + pminsw m3, [pw_ %+ %%maxval]
> + pmaxsw m7, m15
> + pmaxsw m0, m15
> + pmaxsw m8, m15
> + pmaxsw m3, m15
> + movu [uq+xq*2], m7
> + movu [uq+xq*2+mmsize], m8
> + movu [vq+xq*2], m0
> + movu [vq+xq*2+mmsize], m3
> +%endif
> +%endif ; %2 ==/!= 1
> +
> + add xq, mmsize >> %2
> + cmp xd, wwd
> + jl .loop_h
> +
> +%if %3 == 0
> + add yq, ysq
> +%else ; %3 != 0
> + lea yq, [yq+ysq*2]
> +%endif ; %3 ==/!= 0
> + add uq, usq
> + add vq, vsq
> + lea rq, [rq+rgbsq*(2<<%3)]
> + lea gq, [gq+rgbsq*(2<<%3)]
> + lea bq, [bq+rgbsq*(2<<%3)]
> + dec hd
> + jg .loop_v
> +
> + RET
> +%endmacro
> +
> +%macro RGB2YUV_FNS 2
> +RGB2YUV_FN 8, %1, %2
> +RGB2YUV_FN 10, %1, %2
> +RGB2YUV_FN 12, %1, %2
> +%endmacro
> +
> +INIT_XMM sse2
> +RGB2YUV_FNS 0, 0
> +RGB2YUV_FNS 1, 0
> +RGB2YUV_FNS 1, 1
> +
> +; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
> +; int w, int h, const int16_t coeff[3][3][8])
> +INIT_XMM sse2
> +cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
> + movh m0, [cq+ 0]
> + movh m1, [cq+ 32]
> + movh m2, [cq+ 48]
> + movh m3, [cq+ 80]
> + movh m4, [cq+ 96]
> + movh m5, [cq+128]
> + punpcklwd m0, [cq+ 16]
> + punpcklwd m1, [pw_8192]
> + punpcklwd m2, [cq+ 64]
> + punpcklwd m3, [pw_8192]
> + punpcklwd m4, [cq+112]
> + punpcklwd m5, [pw_8192]
> +
> + DEFINE_ARGS data0, stride, ww, h, data1, data2, x
> + shl strideq, 1
> + mov data1q, [data0q+gprsize*1]
> + mov data2q, [data0q+gprsize*2]
> + mov data0q, [data0q+gprsize*0]
> +
> +.loop_v:
> + xor xd, xd
> +
> +.loop_h:
> + mova m6, [data0q+xq*2]
> + mova m7, [data1q+xq*2]
> + mova m8, [data2q+xq*2]
> + SBUTTERFLY wd, 6, 7, 9
> + punpckhwd m9, m8, [pw_1]
> + punpcklwd m8, [pw_1]
> +
> + pmaddwd m10, m6, m0
> + pmaddwd m11, m7, m0
> + pmaddwd m12, m8, m1
> + pmaddwd m13, m9, m1
> + paddd m10, m12
> + paddd m11, m13
> + psrad m10, 14
> + psrad m11, 14
> +
> + pmaddwd m12, m6, m2
> + pmaddwd m13, m7, m2
> + pmaddwd m14, m8, m3
> + pmaddwd m15, m9, m3
> + paddd m12, m14
> + paddd m13, m15
> + psrad m12, 14
> + psrad m13, 14
> +
> + pmaddwd m6, m4
> + pmaddwd m7, m4
> + pmaddwd m8, m5
> + pmaddwd m9, m5
> + paddd m6, m8
> + paddd m7, m9
> + psrad m6, 14
> + psrad m7, 14
> +
> + packssdw m10, m11
> + packssdw m12, m13
> + packssdw m6, m7
> +
> + mova [data0q+xq*2], m10
> + mova [data1q+xq*2], m12
> + mova [data2q+xq*2], m6
> +
> + add xd, mmsize / 2
> + cmp xd, wwd
> + jl .loop_h
> +
> + add data0q, strideq
> + add data1q, strideq
> + add data2q, strideq
> + dec hd
> + jg .loop_v
> +
> + RET
> +%endif
> diff --git a/libavfilter/x86/colorspacedsp_init.c
> b/libavfilter/x86/colorspacedsp_init.c
> new file mode 100644
> index 0000000..78d34bc
> --- /dev/null
> +++ b/libavfilter/x86/colorspacedsp_init.c
> @@ -0,0 +1,119 @@
> +/*
> + * Copyright (c) 2016 Ronald S. Bultje <rsbultje at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/x86/cpu.h"
> +
> +#include "libavfilter/colorspacedsp.h"
> +
> +#define decl_yuv2yuv_fn(t) \
> +void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t
> yuv_out_stride[3], \
> + uint8_t *yuv_in[3], ptrdiff_t
> yuv_in_stride[3], \
> + int w, int h, const int16_t
> yuv2yuv_coeffs[3][3][8], \
> + const int16_t yuv_offset[2][8])
> +
> +#define decl_yuv2yuv_fns(ss) \
> +decl_yuv2yuv_fn(ss##p8to8); \
> +decl_yuv2yuv_fn(ss##p10to8); \
> +decl_yuv2yuv_fn(ss##p12to8); \
> +decl_yuv2yuv_fn(ss##p8to10); \
> +decl_yuv2yuv_fn(ss##p10to10); \
> +decl_yuv2yuv_fn(ss##p12to10); \
> +decl_yuv2yuv_fn(ss##p8to12); \
> +decl_yuv2yuv_fn(ss##p10to12); \
> +decl_yuv2yuv_fn(ss##p12to12)
> +
> +decl_yuv2yuv_fns(420);
> +decl_yuv2yuv_fns(422);
> +decl_yuv2yuv_fns(444);
> +
> +#define decl_yuv2rgb_fn(t) \
> +void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \
> + uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \
> + int w, int h, const int16_t coeff[3][3][8], \
> + const int16_t yuv_offset[8])
> +
> +#define decl_yuv2rgb_fns(ss) \
> +decl_yuv2rgb_fn(ss##p8); \
> +decl_yuv2rgb_fn(ss##p10); \
> +decl_yuv2rgb_fn(ss##p12)
> +
> +decl_yuv2rgb_fns(420);
> +decl_yuv2rgb_fns(422);
> +decl_yuv2rgb_fns(444);
> +
> +#define decl_rgb2yuv_fn(t) \
> +void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \
> + int16_t *rgb_in[3], ptrdiff_t rgb_stride, \
> + int w, int h, const int16_t coeff[3][3][8], \
> + const int16_t yuv_offset[8])
> +
> +#define decl_rgb2yuv_fns(ss) \
> +decl_rgb2yuv_fn(ss##p8); \
> +decl_rgb2yuv_fn(ss##p10); \
> +decl_rgb2yuv_fn(ss##p12)
> +
> +decl_rgb2yuv_fns(420);
> +decl_rgb2yuv_fns(422);
> +decl_rgb2yuv_fns(444);
> +
> +void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h,
> + const int16_t coeff[3][3][8]);
> +
> +void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp) /* replace dsp's function pointers with the SSE2 routines when usable */
> +{
> + int cpu_flags = av_get_cpu_flags();
> +/* Pointers are only overridden when ARCH_X86_64 is set and EXTERNAL_SSE2() accepts the CPU flags. */
> + if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
> +#define assign_yuv2yuv_fns(idx, ss) \
> + dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \
> + dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \
> + dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \
> + dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \
> + dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \
> + dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \
> + dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \
> + dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \
> + dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2
> +/* yuv2yuv[in][out][layout]: depth index 0/1/2 = 8/10/12 bit; layout index 0/1/2 = 444/422/420. */
> + assign_yuv2yuv_fns(2, 420);
> + assign_yuv2yuv_fns(1, 422);
> + assign_yuv2yuv_fns(0, 444);
> +/* yuv2rgb[depth][layout], using the same depth and layout indices. */
> +#define assign_yuv2rgb_fns(idx, ss) \
> + dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \
> + dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \
> + dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2
> +
> + assign_yuv2rgb_fns(2, 420);
> + assign_yuv2rgb_fns(1, 422);
> + assign_yuv2rgb_fns(0, 444);
> +/* rgb2yuv[depth][layout], using the same depth and layout indices. */
> +#define assign_rgb2yuv_fns(idx, ss) \
> + dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \
> + dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \
> + dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2
> +
> + assign_rgb2yuv_fns(2, 420);
> + assign_rgb2yuv_fns(1, 422);
> + assign_rgb2yuv_fns(0, 444);
> +/* multiply3x3 is a single pointer with no depth or layout variants. */
> + dsp->multiply3x3 = ff_multiply3x3_sse2;
> + }
> +}
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index c24e797..81a8b86 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
>
> # libavfilter tests
> AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
> +AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
>
> CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
>
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index fb8defd..e4ca116 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -106,6 +106,9 @@ static const struct {
> #if CONFIG_BLEND_FILTER
> { "vf_blend", checkasm_check_blend },
> #endif
> + #if CONFIG_COLORSPACE_FILTER
> + { "vf_colorspace", checkasm_check_colorspace },
> + #endif
> #endif
> { NULL }
> };
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index 159a0a8..5a76f74 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -33,6 +33,7 @@
> void checkasm_check_alacdsp(void);
> void checkasm_check_blend(void);
> void checkasm_check_bswapdsp(void);
> +void checkasm_check_colorspace(void);
> void checkasm_check_flacdsp(void);
> void checkasm_check_fmtconvert(void);
> void checkasm_check_h264pred(void);
> diff --git a/tests/checkasm/vf_colorspace.c b/tests/checkasm/vf_colorspace.c
> new file mode 100644
> index 0000000..fcbb62a
> --- /dev/null
> +++ b/tests/checkasm/vf_colorspace.c
> @@ -0,0 +1,314 @@
> +/*
> + * Copyright (c) 2016 Ronald S. Bultje <rsbultje at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <string.h>
> +#include "checkasm.h"
> +#include "libavfilter/colorspacedsp.h"
> +#include "libavutil/common.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/intreadwrite.h"
> +
> +#define W 64
> +#define H 64
> +
> +#define randomize_buffers() /* fill the three src planes with random samples masked for the input depth; uses src, idepth, ss_w and ss_h from the expansion site */ \
> + do { \
> + unsigned mask = bpp_mask[idepth]; \
> + int n, m; \
> + int bpp = 1 + (!!idepth); /* bytes per sample: 1 when idepth is 0, otherwise 2 */ \
> + int buf_size = W * H * bpp; \
> + for (m = 0; m < 3; m++) { \
> + int ss = m ? ss_w + ss_h : 0; /* planes 1 and 2 shrink by the subsampling shifts */ \
> + int plane_sz = buf_size >> ss; \
> + for (n = 0; n < plane_sz; n += 4) { /* one 32-bit store covers four bytes */ \
> + unsigned r = rnd() & mask; \
> + AV_WN32A(&src[m][n], r); \
> + } \
> + } \
> + } while (0)
> +
> +/* Chroma layout names used in the test labels, indexed by the fmt loop variable. */
> +static const char *const format_string[] = {
> +    "444", "422", "420"
> +};
> +
> +/*
> + * Masks applied to 32 bits of random data, indexed by the input depth index
> + * (0, 1, 2 for 8, 10, 12 bit): all bits for 8-bit samples, the low 10 or 12
> + * bits of each 16-bit half otherwise.
> + */
> +static const unsigned bpp_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
> +
> +/*
> + * Compare the reference and the optimized yuv2yuv implementation for every
> + * combination of input depth, output depth and chroma layout, using the same
> + * random input, coefficients and offsets for both calls.
> + */
> +static void check_yuv2yuv(void)
> +{
> +    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
> +                 uint8_t *src[3], ptrdiff_t src_stride[3],
> +                 int w, int h, const int16_t coeff[3][3][8],
> +                 const int16_t off[2][8]);
> +    ColorSpaceDSPContext dsp;
> +    int idepth, odepth, fmt, n;
> +    /* Every plane is sized for two bytes per sample, the largest case. */
> +    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
> +    uint8_t *src[3] = { src_y, src_u, src_v };
> +    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H * 2]);
> +    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
> +    LOCAL_ALIGNED_32(int16_t, offset_buf, [16]);
> +    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
> +    /* View the flat aligned buffers with the array shapes the functions take. */
> +    int16_t (*offset)[8] = (int16_t(*)[8]) offset_buf;
> +    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
> +
> +    ff_colorspacedsp_init(&dsp);
> +    /* Each coefficient and offset is replicated across all eight lanes. */
> +    for (n = 0; n < 8; n++) {
> +        offset[0][n] = offset[1][n] = 16;
> +
> +        coeff[0][0][n] = (1 << 14) + (1 << 7) + 1;
> +        coeff[0][1][n] = (1 << 7) - 1;
> +        coeff[0][2][n] = -(1 << 8);
> +        coeff[1][0][n] = coeff[2][0][n] = 0;
> +        coeff[1][1][n] = (1 << 14) + (1 << 7);
> +        coeff[1][2][n] = -(1 << 7);
> +        coeff[2][2][n] = (1 << 14) - (1 << 6);
> +        coeff[2][1][n] = 1 << 6;
> +    }
> +    for (idepth = 0; idepth < 3; idepth++) {
> +        for (odepth = 0; odepth < 3; odepth++) {
> +            for (fmt = 0; fmt < 3; fmt++) {
> +                if (check_func(dsp.yuv2yuv[idepth][odepth][fmt],
> +                               "ff_colorspacedsp_yuv2yuv_%sp%dto%d",
> +                               format_string[fmt],
> +                               idepth * 2 + 8, odepth * 2 + 8)) {
> +                    /* fmt 0/1/2 = 444/422/420: chroma width halves for 422 and 420, height for 420 only. */
> +                    int ss_w = !!fmt, ss_h = fmt == 2;
> +                    /* Strides are in bytes: W for depth index 0, 2 * W otherwise. */
> +                    int y_src_stride = W << !!idepth, y_dst_stride = W << !!odepth;
> +                    int uv_src_stride = y_src_stride >> ss_w, uv_dst_stride = y_dst_stride >> ss_w;
> +
> +                    randomize_buffers();
> +                    call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
> +                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
> +                             W, H, coeff, offset);
> +                    call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
> +                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
> +                             W, H, coeff, offset);
> +                    if (memcmp(dst0[0], dst1[0], y_dst_stride * H) ||
> +                        memcmp(dst0[1], dst1[1], uv_dst_stride * H >> ss_h) ||
> +                        memcmp(dst0[2], dst1[2], uv_dst_stride * H >> ss_h)) {
> +                        fail();
> +                    }
> +                }
> +            }
> +        }
> +    }
> +
> +    report("yuv2yuv");
> +}
> +
> +/*
> + * Compare the reference and the optimized yuv2rgb implementation for every
> + * combination of input depth and chroma layout. The output is three W x H
> + * planes of int16_t with a stride of W.
> + */
> +static void check_yuv2rgb(void)
> +{
> +    declare_func(void, int16_t *dst[3], ptrdiff_t dst_stride,
> +                 uint8_t *src[3], ptrdiff_t src_stride[3],
> +                 int w, int h, const int16_t coeff[3][3][8],
> +                 const int16_t off[8]);
> +    ColorSpaceDSPContext dsp;
> +    int idepth, fmt, n;
> +    /* Source planes are sized for two bytes per sample, the largest case. */
> +    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
> +    uint8_t *src[3] = { src_y, src_u, src_v };
> +    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
> +    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
> +    LOCAL_ALIGNED_32(int16_t, offset, [8]);
> +    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
> +    /* View the flat aligned buffer with the array shape the functions take. */
> +    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
> +
> +    ff_colorspacedsp_init(&dsp);
> +    /* Each coefficient and offset is replicated across all eight lanes. */
> +    for (n = 0; n < 8; n++) {
> +        offset[n] = 16;
> +
> +        coeff[0][0][n] = coeff[1][0][n] = coeff[2][0][n] = (1 << 14) | 1;
> +        coeff[0][1][n] = coeff[2][2][n] = 0;
> +        coeff[0][2][n] = 1 << 13;
> +        coeff[1][1][n] = -(1 << 12);
> +        coeff[1][2][n] = 1 << 12;
> +        coeff[2][1][n] = 1 << 11;
> +    }
> +    for (idepth = 0; idepth < 3; idepth++) {
> +        for (fmt = 0; fmt < 3; fmt++) {
> +            if (check_func(dsp.yuv2rgb[idepth][fmt],
> +                           "ff_colorspacedsp_yuv2rgb_%sp%d",
> +                           format_string[fmt], idepth * 2 + 8)) {
> +                /* ss_h is only consumed by randomize_buffers() below. */
> +                int ss_w = !!fmt, ss_h = fmt == 2;
> +                /* Source strides are in bytes: W for 8 bit, 2 * W otherwise. */
> +                int y_src_stride = W << !!idepth;
> +                int uv_src_stride = y_src_stride >> ss_w;
> +
> +                randomize_buffers();
> +                call_ref(dst0, W, src,
> +                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
> +                         W, H, coeff, offset);
> +                call_new(dst1, W, src,
> +                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
> +                         W, H, coeff, offset);
> +                if (memcmp(dst0[0], dst1[0], W * H * sizeof(int16_t)) ||
> +                    memcmp(dst0[1], dst1[1], W * H * sizeof(int16_t)) ||
> +                    memcmp(dst0[2], dst1[2], W * H * sizeof(int16_t))) {
> +                    fail();
> +                }
> +            }
> +        }
> +    }
> +
> +    report("yuv2rgb");
> +}
> +
> +#undef randomize_buffers
> +#define randomize_buffers() /* fill three W x H planes of src (taken from the expansion site) with random values */ \
> + do { \
> + int y, x, p; \
> + for (p = 0; p < 3; p++) { \
> + for (y = 0; y < H; y++) { \
> + for (x = 0; x < W; x++) { \
> + int r = rnd() & 0x7fff; /* 0 .. 32767 */ \
> + r -= (32768 - 28672) >> 1; /* shift down by 2048, giving -2048 .. 30719 */ \
> + src[p][y * W + x] = r; \
> + } \
> + } \
> + } \
> + } while (0)
> +
> +/*
> + * Compare the reference and the optimized rgb2yuv implementation for every
> + * combination of output depth and chroma layout. The input is three W x H
> + * planes of int16_t with a stride of W.
> + */
> +static void check_rgb2yuv(void)
> +{
> +    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
> +                 int16_t *src[3], ptrdiff_t src_stride,
> +                 int w, int h, const int16_t coeff[3][3][8],
> +                 const int16_t off[8]);
> +    ColorSpaceDSPContext dsp;
> +    int odepth, fmt, n;
> +    LOCAL_ALIGNED_32(int16_t, src_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(int16_t, src_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(int16_t, src_v, [W * H * 2]);
> +    int16_t *src[3] = { src_y, src_u, src_v };
> +    /*
> +     * For 10 and 12 bit output the luma stride is W << 1 bytes and the
> +     * comparison below covers H * y_dst_stride bytes, so every destination
> +     * plane must hold W * H * 2 bytes; W * H would overflow.
> +     */
> +    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H * 2]);
> +    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H * 2]);
> +    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
> +    LOCAL_ALIGNED_32(int16_t, offset, [8]);
> +    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
> +    /* View the flat aligned buffer with the array shape the functions take. */
> +    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
> +
> +    ff_colorspacedsp_init(&dsp);
> +    /* Each coefficient and offset is replicated across all eight lanes. */
> +    for (n = 0; n < 8; n++) {
> +        offset[n] = 16;
> +
> +        // these somewhat resemble bt601/smpte170m coefficients
> +        coeff[0][0][n] = lrint(0.3 * (1 << 14));
> +        coeff[0][1][n] = lrint(0.6 * (1 << 14));
> +        coeff[0][2][n] = lrint(0.1 * (1 << 14));
> +        coeff[1][0][n] = lrint(-0.15 * (1 << 14));
> +        coeff[1][1][n] = lrint(-0.35 * (1 << 14));
> +        coeff[1][2][n] = lrint(0.5 * (1 << 14));
> +        coeff[2][0][n] = lrint(0.5 * (1 << 14));
> +        coeff[2][1][n] = lrint(-0.42 * (1 << 14));
> +        coeff[2][2][n] = lrint(-0.08 * (1 << 14));
> +    }
> +    for (odepth = 0; odepth < 3; odepth++) {
> +        for (fmt = 0; fmt < 3; fmt++) {
> +            if (check_func(dsp.rgb2yuv[odepth][fmt],
> +                           "ff_colorspacedsp_rgb2yuv_%sp%d",
> +                           format_string[fmt], odepth * 2 + 8)) {
> +                /* fmt 0/1/2 = 444/422/420: chroma width halves for 422 and 420, height for 420 only. */
> +                int ss_w = !!fmt, ss_h = fmt == 2;
> +                /* Destination strides are in bytes: W for 8 bit, 2 * W otherwise. */
> +                int y_dst_stride = W << !!odepth;
> +                int uv_dst_stride = y_dst_stride >> ss_w;
> +
> +                randomize_buffers();
> +                call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
> +                         src, W, W, H, coeff, offset);
> +                call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
> +                         src, W, W, H, coeff, offset);
> +                if (memcmp(dst0[0], dst1[0], H * y_dst_stride) ||
> +                    memcmp(dst0[1], dst1[1], H * uv_dst_stride >> ss_h) ||
> +                    memcmp(dst0[2], dst1[2], H * uv_dst_stride >> ss_h)) {
> +                    fail();
> +                }
> +            }
> +        }
> +    }
> +
> +    report("rgb2yuv");
> +}
> +
> +/*
> + * Compare the reference and the optimized multiply3x3 implementation on
> + * identical copies of three random W x H int16_t planes.
> + */
> +static void check_multiply3x3(void)
> +{
> +    declare_func(void, int16_t *data[3], ptrdiff_t stride,
> +                 int w, int h, const int16_t coeff[3][3][8]);
> +    ColorSpaceDSPContext dsp;
> +    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
> +    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
> +    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
> +    /* randomize_buffers() writes through src, so alias it to the first plane set. */
> +    int16_t **src = dst0;
> +    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
> +    /* View the flat aligned buffer with the array shape the functions take. */
> +    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
> +    int n;
> +
> +    ff_colorspacedsp_init(&dsp);
> +    /* Each coefficient is replicated across all eight lanes. */
> +    for (n = 0; n < 8; n++) {
> +        coeff[0][0][n] = lrint(0.85 * (1 << 14));
> +        coeff[0][1][n] = lrint(0.10 * (1 << 14));
> +        coeff[0][2][n] = lrint(0.05 * (1 << 14));
> +        coeff[1][0][n] = lrint(-0.1 * (1 << 14));
> +        coeff[1][1][n] = lrint(0.95 * (1 << 14));
> +        coeff[1][2][n] = lrint(0.15 * (1 << 14));
> +        coeff[2][0][n] = lrint(-0.2 * (1 << 14));
> +        coeff[2][1][n] = lrint(0.30 * (1 << 14));
> +        coeff[2][2][n] = lrint(0.90 * (1 << 14));
> +    }
> +    if (check_func(dsp.multiply3x3, "ff_colorspacedsp_multiply3x3")) {
> +        randomize_buffers();
> +        /* Give the second call the same starting data as the first. */
> +        memcpy(dst1_y, dst0_y, W * H * sizeof(*dst1_y));
> +        memcpy(dst1_u, dst0_u, W * H * sizeof(*dst1_u));
> +        memcpy(dst1_v, dst0_v, W * H * sizeof(*dst1_v));
> +        call_ref(dst0, W, W, H, coeff);
> +        call_new(dst1, W, W, H, coeff);
> +        if (memcmp(dst0[0], dst1[0], H * W * sizeof(*dst0_y)) ||
> +            memcmp(dst0[1], dst1[1], H * W * sizeof(*dst0_u)) ||
> +            memcmp(dst0[2], dst1[2], H * W * sizeof(*dst0_v))) {
> +            fail();
> +        }
> +    }
> +
> +    report("multiply3x3");
> +}
> +
> +/* checkasm entry point for vf_colorspace: run each DSP check in a fixed order. */
> +void checkasm_check_colorspace(void)
> +{
> +    static void (*const checks[])(void) = {
> +        check_yuv2yuv,
> +        check_yuv2rgb,
> +        check_rgb2yuv,
> +        check_multiply3x3,
> +    };
> +    size_t i;
> +
> +    for (i = 0; i < sizeof(checks) / sizeof(checks[0]); i++)
> +        checks[i]();
> +}
> --
> 2.1.2
>
Otherwise seems ok.
Kieran
More information about the ffmpeg-devel
mailing list