[FFmpeg-devel] [PATCH] x86: hevc: adding transform_add

Thu Jul 31 22:42:29 CEST 2014

On 31/07/14 11:58 AM, Pierre Edouard Lepere wrote:
> Hi,
> Here's a new version of the patch with the feedback provided.
> 
> Best Regards,
> 
> Pierre-Edouard Lepere

> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7469293..658ad5e 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
>  YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
>  YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
>                                            x86/hevc_deblock.o            \
> -                                          x86/hevc_idct.o
> +                                          x86/hevc_idct.o               \
> +                                          x86/hevc_res_add.o
>  YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
>  YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
>  YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
> diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
> new file mode 100644
> index 0000000..2bfd35d
> --- /dev/null
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -0,0 +1,396 @@
> +; /*
> +; * Provide SSE optimizations for transform_add functions for HEVC decoding

nit: SIMD instead of SSE.

> +; * Copyright (c) 2014 Pierre-Edouard LEPERE
> +; *
> +; * This file is part of FFmpeg.
> +; *
> +; * FFmpeg is free software; you can redistribute it and/or
> +; * modify it under the terms of the GNU Lesser General Public
> +; * License as published by the Free Software Foundation; either
> +; * version 2.1 of the License, or (at your option) any later version.
> +; *
> +; * FFmpeg is distributed in the hope that it will be useful,
> +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +; * Lesser General Public License for more details.
> +; *
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with FFmpeg; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +; */
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32
> +max_pixels_10:          times 16  dw ((1 << 10)-1)
> +tr_add_10:              times 4 dd ((1 << 14-10) + 1)

This constant seems unused.

> +
> +
> +SECTION .text
> +
> +;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
> +%macro TR_ADD_MMX_4_8 0
> +    mova              m2, [r1]
> +    mova              m4, [r1+8]
> +    pxor              m3, m3
> +    psubw             m3, m2
> +    packuswb          m2, m2
> +    packuswb          m3, m3
> +    pxor              m5, m5
> +    psubw             m5, m4
> +    packuswb          m4, m4
> +    packuswb          m5, m5
> +
> +    movh              m0, [r0     ]
> +    movh              m1, [r0+r2  ]
> +    paddusb           m0, m2
> +    paddusb           m1, m4
> +    psubusb           m0, m3
> +    psubusb           m1, m5
> +    movh       [r0     ], m0
> +    movh       [r0+r2  ], m1
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add4_8, 3, 4, 6
> +    TR_ADD_MMX_4_8
> +    add               r1, 16
> +    lea               r0, [r0+r2*2]
> +    TR_ADD_MMX_4_8
> +    RET
> +
> +%macro TR_ADD_SSE_8_8 0
> +    pxor              m3, m3
> +    mova              m4, [r1]
> +    mova              m6, [r1+16]
> +    mova              m0, [r1+32]
> +    mova              m2, [r1+48]
> +    psubw             m5, m3, m4
> +    psubw             m7, m3, m6
> +    psubw             m1, m3, m0
> +    packuswb          m4, m0
> +    packuswb          m5, m1
> +    psubw             m3, m2
> +    packuswb          m6, m2
> +    packuswb          m7, m3
> +
> +    movq                m0, [r0     ]
> +    movq                m1, [r0+r2  ]
> +    movhps              m0, [r0+r2*2]
> +    movhps              m1, [r0+r3  ]
> +    paddusb             m0, m4
> +    paddusb             m1, m6
> +    psubusb             m0, m5
> +    psubusb             m1, m7
> +    movq         [r0     ], m0
> +    movq         [r0+r2  ], m1
> +    movhps       [r0+2*r2], m0
> +    movhps       [r0+r3  ], m1
> +%endmacro
> +
> +%macro TR_ADD_INIT_SSE_8 0
> +    pxor              m0, m0
> +
> +    mova              m4, [r1]
> +    mova              m1, [r1+16]
> +    psubw             m2, m0, m1
> +    psubw             m5, m0, m4

Add avx versions of add16 and add32 like you originally did for the 10bit 
functions (also with different instruction order depending on avx_enabled if 
possible).
This macro is used up to 16 times in a single function, so all the saved movas 
will make a difference.

> +    packuswb          m4, m1
> +    packuswb          m5, m2
> +
> +    mova              m6, [r1+32]
> +    mova              m1, [r1+48]
> +    psubw             m2, m0, m1
> +    psubw             m7, m0, m6
> +    packuswb          m6, m1
> +    packuswb          m7, m2
> +
> +    mova              m8, [r1+64]
> +    mova              m1, [r1+80]
> +    psubw             m2, m0, m1
> +    psubw             m9, m0, m8
> +    packuswb          m8, m1
> +    packuswb          m9, m2
> +
> +    mova             m10, [r1+96]
> +    mova              m1, [r1+112]
> +    psubw             m2, m0, m1
> +    psubw            m11, m0, m10
> +    packuswb         m10, m1
> +    packuswb         m11, m2
> +%endmacro
> +
> +
> +%macro TR_ADD_SSE_16_8 0
> +    TR_ADD_INIT_SSE_8
> +
> +    mova              m0, [r0     ]
> +    mova              m1, [r0+r2  ]
> +    mova              m2, [r0+r2*2]
> +    mova              m3, [r0+r3  ]

No need for these. The paddusb below can fetch the data from memory.

With some refactoring you can get this and the add32 function to use 10 or maybe 
even 8 xmm registers.
Getting it down to 10 will barely make a difference, but 8 will allow the 
functions to work on x86_32.
It's not really important, and I can give it a go if you'd rather work on other 
dsp functions. This can be committed with the above changes alone.

> +    paddusb           m0, m4
> +    paddusb           m1, m6
> +    paddusb           m2, m8
> +    paddusb           m3, m10
> +    psubusb           m0, m5
> +    psubusb           m1, m7
> +    psubusb           m2, m9
> +    psubusb           m3, m11
> +    mova       [r0     ], m0
> +    mova       [r0+r2  ], m1
> +    mova       [r0+2*r2], m2
> +    mova       [r0+r3  ], m3
> +%endmacro
> +
> +%macro TR_ADD_SSE_32_8 0
> +    TR_ADD_INIT_SSE_8
> +
> +    mova              m0, [r0      ]
> +    mova              m1, [r0+16   ]
> +    mova              m2, [r0+r2   ]
> +    mova              m3, [r0+r2+16]

Same as above.

> +    paddusb           m0, m4
> +    paddusb           m1, m6
> +    paddusb           m2, m8
> +    paddusb           m3, m10
> +    psubusb           m0, m5
> +    psubusb           m1, m7
> +    psubusb           m2, m9
> +    psubusb           m3, m11
> +    mova      [r0      ], m0
> +    mova      [r0+16   ], m1
> +    mova      [r0+r2   ], m2
> +    mova      [r0+r2+16], m3
> +%endmacro

[...]

> diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
> index 4bcc8dc..5c3a51c 100644
> --- a/libavcodec/x86/hevcdsp.h
> +++ b/libavcodec/x86/hevcdsp.h
> @@ -131,4 +131,25 @@ WEIGHTING_PROTOTYPES(8, sse4);
>  WEIGHTING_PROTOTYPES(10, sse4);
>  WEIGHTING_PROTOTYPES(12, sse4);
>  
> +///////////////////////////////////////////////////////////////////////////////
> +// TRANSFORM_ADD
> +///////////////////////////////////////////////////////////////////////////////
> +void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +
> +void ff_hevc_transform_add8_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);

These three are not needed anymore.