[FFmpeg-devel] [PATCH] x86: hevc: adding transform_add
James Almer
jamrial at gmail.com
Wed Jul 30 20:39:53 CEST 2014
On 30/07/14 10:33 AM, Pierre Edouard Lepere wrote:
> Hi,
>
> Here's a patch adding ASM transform_add functions for HEVC.
>
> Regards,
> Pierre-Edouard Lepere
Some remarks below.
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7469293..658ad5e 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
> YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
> YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \
> x86/hevc_deblock.o \
> - x86/hevc_idct.o
> + x86/hevc_idct.o \
> + x86/hevc_res_add.o
Why not add this to hevc_idct.asm? It technically used to be there before the rext split.
> YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
> YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
> YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
> diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
> new file mode 100644
> index 0000000..bc550ef
> --- /dev/null
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -0,0 +1,454 @@
> +; /*
> +; * Provide intrinsics for transform_add functions for HEVC decoding
It's not intrinsics anymore :P
> +; * Copyright (c) 2014 Pierre-Edouard LEPERE
> +; *
> +; * This file is part of FFmpeg.
> +; *
> +; * FFmpeg is free software; you can redistribute it and/or
> +; * modify it under the terms of the GNU Lesser General Public
> +; * License as published by the Free Software Foundation; either
> +; * version 2.1 of the License, or (at your option) any later version.
> +; *
> +; * FFmpeg is distributed in the hope that it will be useful,
> +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +; * Lesser General Public License for more details.
> +; *
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with FFmpeg; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +; */
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
SECTION_RODATA 32
It's needed for the AVX2 functions (mova with ymm registers requires 32-byte alignment).
> +max_pixels_10: times 16 dw ((1 << 10)-1)
> +tr_add_10: times 4 dd ((1 << 14-10) + 1)
> +
> +
> +SECTION .text
> +
> +;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
> +%macro TR_ADD_INIT_MMX 2
> + mova m2, [r1]
> + mova m4, [r1+8]
> + pxor m3, m3
> + psubw m3, m2
> + packuswb m2, m2
> + packuswb m3, m3
> + pxor m5, m5
> + psubw m5, m4
> + packuswb m4, m4
> + packuswb m5, m5
> +%endmacro
> +
> +%macro TR_ADD_OP_MMX 4
This macro is used only by one function, and %1 is always movh.
Just hardcode it like the movas above; see the sketch after the quoted macro.
> + %1 m0, [%2 ]
> + %1 m1, [%2+%3 ]
> + paddusb m0, m2
> + paddusb m1, m4
> + psubusb m0, m3
> + psubusb m1, m5
> + %1 [%2 ], m0
> + %1 [%2+%3 ], m1
> +%endmacro
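For example (untested, also dropping the unused fourth argument):

%macro TR_ADD_OP_MMX 2 ; dst, stride
    movh m0, [%1 ]
    movh m1, [%1+%2 ]
    paddusb m0, m2
    paddusb m1, m4
    psubusb m0, m3
    psubusb m1, m5
    movh [%1 ], m0
    movh [%1+%2 ], m1
%endmacro

and call it as TR_ADD_OP_MMX r0, r2.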
> +
> +%macro TR_ADD_INIT_SSE_8 2
> + movu m4, [r1]
> + movu m6, [r1+16]
> + movu m8, [r1+32]
> + movu m10, [r1+48]
> + lea %1, [%2*3]
> + pxor m5, m5
> + psubw m5, m4
> + packuswb m4, m4
> + packuswb m5, m5
> + pxor m7, m7
> + psubw m7, m6
> + packuswb m6, m6
> + packuswb m7, m7
> + pxor m9, m9
> + psubw m9, m8
> + packuswb m8, m8
> + packuswb m9, m9
> + pxor m11, m11
> + psubw m11, m10
> + packuswb m10, m10
> + packuswb m11, m11
> +%endmacro
> +
> +%macro TR_ADD_INIT_SSE_16 2
> + lea %1, [%2*3]
> + movu m4, [r1]
> + movu m6, [r1+16]
> + pxor m5, m5
> + psubw m7, m5, m6
> + psubw m5, m4
> + packuswb m4, m6
> + packuswb m5, m7
> +
> + movu m6, [r1+32]
> + movu m8, [r1+48]
> + pxor m7, m7
> + psubw m9, m7, m8
> + psubw m7, m6
> + packuswb m6, m8
> + packuswb m7, m9
> +
> + movu m8, [r1+64]
> + movu m10, [r1+80]
> + pxor m9, m9
> + psubw m11, m9, m10
> + psubw m9, m8
> + packuswb m8, m10
> + packuswb m9, m11
> +
> + movu m10, [r1+96]
> + movu m12, [r1+112]
> + pxor m11, m11
> + psubw m13, m11, m12
> + psubw m11, m10
> + packuswb m10, m12
> + packuswb m11, m13
> +%endmacro
> +
> +%macro TR_ADD_OP_SSE 4
> + %1 m0, [%2 ]
> + %1 m1, [%2+%3 ]
> + %1 m2, [%2+%3*2]
> + %1 m3, [%2+%4 ]
> + paddusb m0, m4
> + paddusb m1, m6
> + paddusb m2, m8
> + paddusb m3, m10
> + psubusb m0, m5
> + psubusb m1, m7
> + psubusb m2, m9
> + psubusb m3, m11
> + %1 [%2 ], m0
> + %1 [%2+%3 ], m1
> + %1 [%2+2*%3], m2
> + %1 [%2+%4 ], m3
> +%endmacro
> +
> +%macro TR_ADD_OP_SSE_32 4
> + %1 m0, [%2 ]
> + %1 m1, [%2+16 ]
> + %1 m2, [%2+%3 ]
> + %1 m3, [%2+%3+16]
> + paddusb m0, m4
> + paddusb m1, m6
> + paddusb m2, m8
> + paddusb m3, m10
> + psubusb m0, m5
> + psubusb m1, m7
> + psubusb m2, m9
> + psubusb m3, m11
> + %1 [%2 ], m0
> + %1 [%2+16 ], m1
> + %1 [%2+%3 ], m2
> + %1 [%2+%3+16], m3
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add4_8, 3, 4, 6
> + TR_ADD_INIT_MMX r3, r2
> + TR_ADD_OP_MMX movh, r0, r2, r3
> + lea r1, [r1+16]
add r1, 16
There are several similar cases below where a simple lea could be an add instead
(the lea r1, [r1+64] and lea r1, [r1+128] ones, for example).
> + lea r0, [r0+r2*2]
> + TR_ADD_INIT_MMX r3, r2
> + TR_ADD_OP_MMX movh, r0, r2, r3
> + RET
> +
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +; void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add8_8, 3, 4, 6
You're using 12 xmm registers, but only declaring 6.
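It should be
cglobal hevc_transform_add8_8, 3, 4, 12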
> + TR_ADD_INIT_SSE_8 r3, r2
> + TR_ADD_OP_SSE movh, r0, r2, r3
> + lea r1, [r1+8*8]
> + lea r0, [r0+r2*4]
> + TR_ADD_INIT_SSE_8 r3, r2
> + TR_ADD_OP_SSE movh, r0, r2, r3
> + RET
> +
> +
> +; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add16_8, 3, 4, 6
Same as above: you're using 14 xmm registers here and in the function below.
Do you really need that many? It should be possible to use fewer if you rearrange
some instructions.
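For example, a per-row arrangement along these lines would only need five
registers (untested sketch):

%macro TR_ADD_ROW_8 2 ; dst, coeffs
    movu m1, [%2]       ; coeffs 0-7
    movu m2, [%2+16]    ; coeffs 8-15
    pxor m3, m3
    pxor m4, m4
    psubw m3, m1        ; negated coeffs
    psubw m4, m2
    packuswb m1, m2     ; positive halves, saturated to [0,255]
    packuswb m3, m4     ; negative halves, saturated to [0,255]
    mova m0, [%1]
    paddusb m0, m1      ; dst + max(coeff, 0)
    psubusb m0, m3      ; dst - max(-coeff, 0)
    mova [%1], m0
%endmacro

As a bonus, staying below eight registers would let these functions work on
x86_32 as well.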
> + TR_ADD_INIT_SSE_16 r3, r2
> + TR_ADD_OP_SSE mova, r0, r2, r3
> +%rep 3
> + lea r1, [r1+16*8]
> + lea r0, [r0+r2*4]
> + TR_ADD_INIT_SSE_16 r3, r2
> + TR_ADD_OP_SSE mova, r0, r2, r3
> +%endrep
> + RET
> +
> +; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add32_8, 3, 4, 6
> + TR_ADD_INIT_SSE_16 r3, r2
> + TR_ADD_OP_SSE_32 mova, r0, r2, r3
> +%rep 15
> + lea r1, [r1+16*8]
> + lea r0, [r0+r2*2]
> + TR_ADD_INIT_SSE_16 r3, r2
> + TR_ADD_OP_SSE_32 mova, r0, r2, r3
> +%endrep
> + RET
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +
> +%endif ;HAVE_AVX2_EXTERNAL
This is not needed if you're not adding 8bit AVX2 functions for the time being.
> +%endif ;ARCH_X86_64
> +;-----------------------------------------------------------------------------
> +; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
> +;-----------------------------------------------------------------------------
> +%macro TR_ADD_OP_10 4
> + movu m6, [%4]
> + movu m7, [%4+16]
> + movu m8, [%4+32]
> + movu m9, [%4+48]
> +%if avx_enabled
> + paddw m0, m6, [%1+0 ]
> + paddw m1, m7, [%1+%2 ]
> + paddw m2, m8, [%1+%2*2]
> + paddw m3, m9, [%1+%3 ]
> +%else
> + movu m0, [%1+0 ]
> + movu m1, [%1+%2 ]
> + movu m2, [%1+%2*2]
> + movu m3, [%1+%3 ]
> + paddw m0, m6
> + paddw m1, m7
> + paddw m2, m8
> + paddw m3, m9
This makes no sense. You're using four extra registers and four more movs
than necessary.
It would make sense if those movs needed to be unaligned, but they obviously
don't judging by the avx version.
mova m0, [%4]
mova m1, [%4+16]
mova m2, [%4+32]
mova m3, [%4+48]
paddw m0, [%1+0 ]
paddw m1, [%1+%2 ]
paddw m2, [%1+%2*2]
paddw m3, [%1+%3 ]
You can remove the xmm AVX versions after this, since there are no three-operand
instructions left to benefit from AVX. Also, since it brings the register count
below eight, this would make the functions work on x86_32.
> +%endif
> + CLIPW m0, m4, m5
> + CLIPW m1, m4, m5
> + CLIPW m2, m4, m5
> + CLIPW m3, m4, m5
> + movu [%1+0 ], m0
> + movu [%1+%2 ], m1
> + movu [%1+%2*2], m2
> + movu [%1+%3 ], m3
> +%endmacro
> +
> +%macro TR_ADD_MMX_10 3
> + mova m4, [%3]
> + mova m5, [%3+8]
> + mova m0, [%1+0 ]
> + mova m1, [%1+%2 ]
> + paddw m0, m4
> + paddw m1, m5
> + CLIPW m0, m2, m3
> + CLIPW m1, m2, m3
> + mova [%1+0 ], m0
> + mova [%1+%2 ], m1
> +%endmacro
> +
> +%macro TRANS_ADD16_10 3
> + mova m6, [%3]
> + mova m7, [%3+16]
> + mova m8, [%3+32]
> + mova m9, [%3+48]
> +
> +%if avx_enabled
> + paddw m0, m6, [%1 ]
> + paddw m1, m7, [%1+16 ]
> + paddw m2, m8, [%1+%2 ]
> + paddw m3, m9, [%1+%2+16]
> +%else
> + mova m0, [%1 ]
> + mova m1, [%1+16 ]
> + mova m2, [%1+%2 ]
> + mova m3, [%1+%2+16]
> + paddw m0, m6
> + paddw m1, m7
> + paddw m2, m8
> + paddw m3, m9
Same as above, here and below.
> +%endif
> + CLIPW m0, m4, m5
> + CLIPW m1, m4, m5
> + CLIPW m2, m4, m5
> + CLIPW m3, m4, m5
> + mova [%1 ], m0
> + mova [%1+16 ], m1
> + mova [%1+%2 ], m2
> + mova [%1+%2+16], m3
> +%endmacro
> +
> +%macro TRANS_ADD32_10 2
> + mova m6, [%2]
> + mova m7, [%2+16]
> + mova m8, [%2+32]
> + mova m9, [%2+48]
> +
> +%if avx_enabled
> + paddw m0, m6, [%1 ]
> + paddw m1, m7, [%1+16]
> + paddw m2, m8, [%1+32]
> + paddw m3, m9, [%1+48]
> +%else
> + mova m0, [%1 ]
> + mova m1, [%1+16]
> + mova m2, [%1+32]
> + mova m3, [%1+48]
> + paddw m0, m6
> + paddw m1, m7
> + paddw m2, m8
> + paddw m3, m9
> +%endif
> + CLIPW m0, m4, m5
> + CLIPW m1, m4, m5
> + CLIPW m2, m4, m5
> + CLIPW m3, m4, m5
> + mova [%1 ], m0
> + mova [%1+16], m1
> + mova [%1+32], m2
> + mova [%1+48], m3
> +%endmacro
> +
> +
> +
> +%macro TRANS_ADD16_AVX2 4
The macro names are a bit confusing: some state the bitdepth, others don't,
some say TR_ADD, others TRANS_ADD, etc. It's mostly cosmetic and not really
important, but it would be nice if the names were more consistent.
> + movu m6, [%4]
> + movu m7, [%4+32]
> + movu m8, [%4+64]
> + movu m9, [%4+96]
> +
> + paddw m0, m6, [%1+0 ]
> + paddw m1, m7, [%1+%2 ]
> + paddw m2, m8, [%1+%2*2]
> + paddw m3, m9, [%1+%3 ]
Same as the SSE2 versions.
> +
> + CLIPW m0, m4, m5
> + CLIPW m1, m4, m5
> + CLIPW m2, m4, m5
> + CLIPW m3, m4, m5
> + movu [%1+0 ], m0
> + movu [%1+%2 ], m1
> + movu [%1+%2*2], m2
> + movu [%1+%3 ], m3
> +%endmacro
> +
> +%macro TRANS_ADD32_AVX2 3
> + movu m6, [%3]
> + movu m7, [%3+32]
> + movu m8, [%3+64]
> + movu m9, [%3+96]
> +
> + paddw m0, m6, [%1 ]
> + paddw m1, m7, [%1+32 ]
> + paddw m2, m8, [%1+%2 ]
> + paddw m3, m9, [%1+%2+32]
> +
> + CLIPW m0, m4, m5
> + CLIPW m1, m4, m5
> + CLIPW m2, m4, m5
> + CLIPW m3, m4, m5
> + movu [%1 ], m0
> + movu [%1+32 ], m1
> + movu [%1+%2 ], m2
> + movu [%1+%2+32], m3
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +cglobal hevc_transform_add4_10,3,4, 6
> + pxor m2, m2
> + mova m3, [max_pixels_10]
> + TR_ADD_MMX_10 r0, r2, r1
> + lea r1, [r1+16]
> + lea r0, [r0+2*r2]
> + TR_ADD_MMX_10 r0, r2, r1
> + RET
> +
> +;-----------------------------------------------------------------------------
> +; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
> +;-----------------------------------------------------------------------------
> +%macro TR_ADD8 0
> +cglobal hevc_transform_add8_10,3,4,7
> + pxor m4, m4
> + mova m5, [max_pixels_10]
> + lea r3, [r2*3]
> +
> + TR_ADD_OP_10 r0, r2, r3, r1
> + lea r0, [r0+r2*4]
> + lea r1, [r1+64]
> + TR_ADD_OP_10 r0, r2, r3, r1
> + RET
> +%endmacro
> +
> +%macro TRANS_ADD16 0
> +cglobal hevc_transform_add16_10,3,4,7
> + pxor m4, m4
> + mova m5, [max_pixels_10]
> +
> + TRANS_ADD16_10 r0, r2, r1
> +%rep 7
> + lea r0, [r0+r2*2]
> + lea r1, [r1+64]
> + TRANS_ADD16_10 r0, r2, r1
> +%endrep
> + RET
> +%endmacro
> +
> +
> +%macro TRANS_ADD32 0
> +cglobal hevc_transform_add32_10,3,4,7
> + pxor m4, m4
> + mova m5, [max_pixels_10]
> +
> + TRANS_ADD32_10 r0, r1
> +%rep 31
> + lea r0, [r0+r2]
> + lea r1, [r1+64]
> + TRANS_ADD32_10 r0, r1
> +%endrep
> + RET
> +%endmacro
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +TR_ADD8
> +TRANS_ADD16
> +TRANS_ADD32
> +%if HAVE_AVX_EXTERNAL
> +INIT_XMM avx
> +TR_ADD8
> +TRANS_ADD16
> +TRANS_ADD32
> +%endif
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +
> +cglobal hevc_transform_add16_10,3,4,10
> + pxor m4, m4
> + mova m5, [max_pixels_10]
> + lea r3, [r2*3]
> +
> + TRANS_ADD16_AVX2 r0, r2, r3, r1
> +%rep 3
> + lea r0, [r0+r2*4]
> + lea r1, [r1+128]
> + TRANS_ADD16_AVX2 r0, r2, r3, r1
> +%endrep
> + RET
> +
> +cglobal hevc_transform_add32_10,3,4,10
> + pxor m4, m4
> + mova m5, [max_pixels_10]
> +
> + TRANS_ADD32_AVX2 r0, r2, r1
> +%rep 15
> + lea r0, [r0+r2*2]
> + lea r1, [r1+128]
> + TRANS_ADD32_AVX2 r0, r2, r1
> +%endrep
> + RET
> +%endif ;HAVE_AVX_EXTERNAL
> +%endif ;ARCH_X86_64
> +
> diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
> index 4bcc8dc..5c3a51c 100644
> --- a/libavcodec/x86/hevcdsp.h
> +++ b/libavcodec/x86/hevcdsp.h
Not part of the review, but is this header necessary at all? Only one file
includes it, so all these prototypes could very well be there instead.
> @@ -131,4 +131,25 @@ WEIGHTING_PROTOTYPES(8, sse4);
> WEIGHTING_PROTOTYPES(10, sse4);
> WEIGHTING_PROTOTYPES(12, sse4);
>
> +///////////////////////////////////////////////////////////////////////////////
> +// TRANSFORM_ADD
> +///////////////////////////////////////////////////////////////////////////////
> +void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +
> +void ff_hevc_transform_add8_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> #endif // AVCODEC_X86_HEVCDSP_H
> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
> index 828c081..0f3ad7c 100644
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -469,6 +469,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
> if (EXTERNAL_MMXEXT(cpu_flags)) {
> c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
> c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
> + c->transform_add[0] = ff_hevc_transform_add4_8_mmxext;
> }
> if (EXTERNAL_SSE2(cpu_flags)) {
> c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
> @@ -476,8 +477,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
> if (ARCH_X86_64) {
> c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
> c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
> + c->transform_add[1] = ff_hevc_transform_add8_8_sse2;
> + c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
> + c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
> }
> -
> c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
> c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
> c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
> @@ -512,6 +515,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
> }
> } else if (bit_depth == 10) {
> if (EXTERNAL_MMXEXT(cpu_flags)) {
> + c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
> c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
> c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
> }
> @@ -521,6 +525,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
> if (ARCH_X86_64) {
> c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
> c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
> + c->transform_add[1] = ff_hevc_transform_add8_8_sse2;
> + c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
> + c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
You're using the 8-bit functions instead of the 10-bit ones here.
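It should be

c->transform_add[1] = ff_hevc_transform_add8_10_sse2;
c->transform_add[2] = ff_hevc_transform_add16_10_sse2;
c->transform_add[3] = ff_hevc_transform_add32_10_sse2;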
> }
>
> c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
> @@ -548,11 +555,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
> if (ARCH_X86_64) {
> c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
> c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
> + c->transform_add[1] = ff_hevc_transform_add8_10_avx;
> + c->transform_add[2] = ff_hevc_transform_add16_10_avx;
> + c->transform_add[3] = ff_hevc_transform_add32_10_avx;
> }
> }
> if (EXTERNAL_AVX2(cpu_flags)) {
> +
> c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
> c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
> + if (ARCH_X86_64) {
> + c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
> + c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
> + }
>
> }
> } else if (bit_depth == 12) {
I suppose adding 12-bit versions should be pretty easy, but there's no hurry since
I doubt there are any such files out there outside the conformance suite :P
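If you ever do, the CLIPW path should mostly just need a new bound constant,
something like

max_pixels_12: times 16 dw ((1 << 12)-1)

with the same macros instantiated using it instead of max_pixels_10.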