[FFmpeg-devel] [PATCH] x86: hevc: adding transform_add
James Almer
jamrial at gmail.com
Thu Jul 31 22:42:29 CEST 2014
On 31/07/14 11:58 AM, Pierre Edouard Lepere wrote:
> Hi,
> Here's a new version of the patch with the feedback provided.
>
> Best Regards,
>
> Pierre-Edouard Lepere
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7469293..658ad5e 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
> YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
> YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \
> x86/hevc_deblock.o \
> - x86/hevc_idct.o
> + x86/hevc_idct.o \
> + x86/hevc_res_add.o
> YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
> YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
> YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
> diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
> new file mode 100644
> index 0000000..2bfd35d
> --- /dev/null
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -0,0 +1,396 @@
> +; /*
> +; * Provide SSE optimizations for transform_add functions for HEVC decoding
nit: SIMD instead of SSE.
> +; * Copyright (c) 2014 Pierre-Edouard LEPERE
> +; *
> +; * This file is part of FFmpeg.
> +; *
> +; * FFmpeg is free software; you can redistribute it and/or
> +; * modify it under the terms of the GNU Lesser General Public
> +; * License as published by the Free Software Foundation; either
> +; * version 2.1 of the License, or (at your option) any later version.
> +; *
> +; * FFmpeg is distributed in the hope that it will be useful,
> +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +; * Lesser General Public License for more details.
> +; *
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with FFmpeg; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +; */
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32
> +max_pixels_10: times 16 dw ((1 << 10)-1)
> +tr_add_10: times 4 dd ((1 << 14-10) + 1)
This constant seems unused.
> +
> +
> +SECTION .text
> +
> +;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
> +%macro TR_ADD_MMX_4_8 0
> + mova m2, [r1]
> + mova m4, [r1+8]
> + pxor m3, m3
> + psubw m3, m2
> + packuswb m2, m2
> + packuswb m3, m3
> + pxor m5, m5
> + psubw m5, m4
> + packuswb m4, m4
> + packuswb m5, m5
> +
> + movh m0, [r0 ]
> + movh m1, [r0+r2 ]
> + paddusb m0, m2
> + paddusb m1, m4
> + psubusb m0, m3
> + psubusb m1, m5
> + movh [r0 ], m0
> + movh [r0+r2 ], m1
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add4_8, 3, 4, 6
> + TR_ADD_MMX_4_8
> + add r1, 16
> + lea r0, [r0+r2*2]
> + TR_ADD_MMX_4_8
> + RET
> +
> +%macro TR_ADD_SSE_8_8 0
> + pxor m3, m3
> + mova m4, [r1]
> + mova m6, [r1+16]
> + mova m0, [r1+32]
> + mova m2, [r1+48]
> + psubw m5, m3, m4
> + psubw m7, m3, m6
> + psubw m1, m3, m0
> + packuswb m4, m0
> + packuswb m5, m1
> + psubw m3, m2
> + packuswb m6, m2
> + packuswb m7, m3
> +
> + movq m0, [r0 ]
> + movq m1, [r0+r2 ]
> + movhps m0, [r0+r2*2]
> + movhps m1, [r0+r3 ]
> + paddusb m0, m4
> + paddusb m1, m6
> + psubusb m0, m5
> + psubusb m1, m7
> + movq [r0 ], m0
> + movq [r0+r2 ], m1
> + movhps [r0+2*r2], m0
> + movhps [r0+r3 ], m1
> +%endmacro
> +
> +%macro TR_ADD_INIT_SSE_8 0
> + pxor m0, m0
> +
> + mova m4, [r1]
> + mova m1, [r1+16]
> + psubw m2, m0, m1
> + psubw m5, m0, m4
Add avx versions of add16 and add32 like you originally did for the 10bit
functions (also with different instruction order depending on avx_enabled if
possible).
This macro is used up to 16 times in a single function, so all the saved movas
will make a difference.
> + packuswb m4, m1
> + packuswb m5, m2
> +
> + mova m6, [r1+32]
> + mova m1, [r1+48]
> + psubw m2, m0, m1
> + psubw m7, m0, m6
> + packuswb m6, m1
> + packuswb m7, m2
> +
> + mova m8, [r1+64]
> + mova m1, [r1+80]
> + psubw m2, m0, m1
> + psubw m9, m0, m8
> + packuswb m8, m1
> + packuswb m9, m2
> +
> + mova m10, [r1+96]
> + mova m1, [r1+112]
> + psubw m2, m0, m1
> + psubw m11, m0, m10
> + packuswb m10, m1
> + packuswb m11, m2
> +%endmacro
> +
> +
> +%macro TR_ADD_SSE_16_8 0
> + TR_ADD_INIT_SSE_8
> +
> + mova m0, [r0 ]
> + mova m1, [r0+r2 ]
> + mova m2, [r0+r2*2]
> + mova m3, [r0+r3 ]
No need for these. The paddusb below can fetch the data from memory.
With some refactoring you can get this and the add32 function to use 10 or maybe
even 8 xmm registers.
Getting it down to 10 will barely make a difference, but 8 will allow the
functions to work on x86_32.
It's not really important, and I can give it a go if you'd rather work on other
dsp functions. This can be committed with the above changes alone.
> + paddusb m0, m4
> + paddusb m1, m6
> + paddusb m2, m8
> + paddusb m3, m10
> + psubusb m0, m5
> + psubusb m1, m7
> + psubusb m2, m9
> + psubusb m3, m11
> + mova [r0 ], m0
> + mova [r0+r2 ], m1
> + mova [r0+2*r2], m2
> + mova [r0+r3 ], m3
> +%endmacro
> +
> +%macro TR_ADD_SSE_32_8 0
> + TR_ADD_INIT_SSE_8
> +
> + mova m0, [r0 ]
> + mova m1, [r0+16 ]
> + mova m2, [r0+r2 ]
> + mova m3, [r0+r2+16]
Same as above.
> + paddusb m0, m4
> + paddusb m1, m6
> + paddusb m2, m8
> + paddusb m3, m10
> + psubusb m0, m5
> + psubusb m1, m7
> + psubusb m2, m9
> + psubusb m3, m11
> + mova [r0 ], m0
> + mova [r0+16 ], m1
> + mova [r0+r2 ], m2
> + mova [r0+r2+16], m3
> +%endmacro
[...]
> diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
> index 4bcc8dc..5c3a51c 100644
> --- a/libavcodec/x86/hevcdsp.h
> +++ b/libavcodec/x86/hevcdsp.h
> @@ -131,4 +131,25 @@ WEIGHTING_PROTOTYPES(8, sse4);
> WEIGHTING_PROTOTYPES(10, sse4);
> WEIGHTING_PROTOTYPES(12, sse4);
>
> +///////////////////////////////////////////////////////////////////////////////
> +// TRANSFORM_ADD
> +///////////////////////////////////////////////////////////////////////////////
> +void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +
> +void ff_hevc_transform_add8_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
These three are not needed anymore.
More information about the ffmpeg-devel
mailing list