[FFmpeg-cvslog] Merge commit '6d5636ad9ab6bd9bedf902051d88b7044385f88b'
Clément Bœsch
git at videolan.org
Fri Mar 24 13:34:19 EET 2017
ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Fri Mar 24 12:29:21 2017 +0100| [3d6535983282bea542dac2e568ae50da5796be34] | committer: Clément Bœsch
Merge commit '6d5636ad9ab6bd9bedf902051d88b7044385f88b'
* commit '6d5636ad9ab6bd9bedf902051d88b7044385f88b':
hevc: x86: Add add_residual() SIMD optimizations
See a6af4bf64dae46356a5f91537a1c8c5f86456b37
This merge is only cosmetic (renames, space shuffling, etc.).
The functional changes in the ASM are *not* merged:
- unrolling with %rep is kept
- ADD_RES_MMX_4_8 is left untouched: this needs investigation
Merged-by: Clément Bœsch <u at pkh.me>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3d6535983282bea542dac2e568ae50da5796be34
---
doc/libav-merge.txt | 1 +
libavcodec/hevcdsp.h | 2 +-
libavcodec/x86/hevc_add_res.asm | 269 +++++++++++++++++++---------------------
libavcodec/x86/hevcdsp.h | 29 ++---
libavcodec/x86/hevcdsp_init.c | 30 ++---
5 files changed, 163 insertions(+), 168 deletions(-)
diff --git a/doc/libav-merge.txt b/doc/libav-merge.txt
index d57b79a..44547c9 100644
--- a/doc/libav-merge.txt
+++ b/doc/libav-merge.txt
@@ -97,6 +97,7 @@ Stuff that didn't reach the codebase:
- VAAPI VP8 decode hwaccel (currently under review: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/thread.html#207348)
- Removal of the custom atomic API (5cc0057f49, see http://ffmpeg.org/pipermail/ffmpeg-devel/2017-March/209003.html)
- Use the new bitstream filter for extracting extradata (see 8e2ea69135 and 096a8effa3)
+- ADD_RES_MMX_4_8 in libavcodec/x86/hevc_add_res.asm probably needs updating (see 589880710)
Collateral damage that needs work locally:
------------------------------------------
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 3b7e737..eefb3cd 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -46,7 +46,7 @@ typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
struct GetBitContext *gb, int pcm_bit_depth);
- void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+ void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void (*dequant)(int16_t *coeffs, int16_t log2_size);
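
For context: the add_residual functions renamed above add the inverse-transform
residual to the prediction samples already present in dst, clamping the result
to the valid pixel range. A minimal scalar sketch of the 8-bit case
(hypothetical reference code, not part of this patch; the function name is
illustrative):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical scalar equivalent of the 8-bit add_residual functions:
     * dst already holds the prediction, res holds the int16_t residual,
     * and each sum is clamped to [0, 255]. The SIMD below gets the same
     * effect with paddusb/psubusb on the positive/negative residual parts. */
    static void add_residual_8_c(uint8_t *dst, const int16_t *res,
                                 ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
            dst += stride;
            res += size;
        }
    }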
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
index 869288f..1ea15df 100644
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -1,4 +1,4 @@
-; /*
+; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
@@ -17,7 +17,8 @@
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-; */
+; ******************************************************************************
+
%include "libavutil/x86/x86util.asm"
SECTION .text
@@ -25,9 +26,8 @@ SECTION .text
cextern pw_1023
%define max_pixels_10 pw_1023
-
-;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
-%macro TR_ADD_MMX_4_8 0
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
mova m2, [r1]
mova m4, [r1+8]
pxor m3, m3
@@ -39,27 +39,27 @@ cextern pw_1023
packuswb m4, m4
packuswb m5, m5
- movh m0, [r0 ]
- movh m1, [r0+r2 ]
+ movh m0, [r0]
+ movh m1, [r0+r2]
paddusb m0, m2
paddusb m1, m4
psubusb m0, m3
psubusb m1, m5
- movh [r0 ], m0
- movh [r0+r2 ], m1
+ movh [r0], m0
+ movh [r0+r2], m1
%endmacro
INIT_MMX mmxext
-; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual4_8, 3, 4, 6
- TR_ADD_MMX_4_8
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+ ADD_RES_MMX_4_8
add r1, 16
lea r0, [r0+r2*2]
- TR_ADD_MMX_4_8
+ ADD_RES_MMX_4_8
RET
-%macro TR_ADD_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 0
pxor m3, m3
mova m4, [r1]
mova m6, [r1+16]
@@ -74,22 +74,22 @@ cglobal hevc_add_residual4_8, 3, 4, 6
packuswb m6, m2
packuswb m7, m3
- movq m0, [r0 ]
- movq m1, [r0+r2 ]
- movhps m0, [r0+r2*2]
- movhps m1, [r0+r3 ]
- paddusb m0, m4
- paddusb m1, m6
- psubusb m0, m5
- psubusb m1, m7
- movq [r0 ], m0
- movq [r0+r2 ], m1
- movhps [r0+2*r2], m0
- movhps [r0+r3 ], m1
+ movq m0, [r0]
+ movq m1, [r0+r2]
+ movhps m0, [r0+r2*2]
+ movhps m1, [r0+r3]
+ paddusb m0, m4
+ paddusb m1, m6
+ psubusb m0, m5
+ psubusb m1, m7
+ movq [r0], m0
+ movq [r0+r2], m1
+ movhps [r0+2*r2], m0
+ movhps [r0+r3], m1
%endmacro
-%macro TR_ADD_SSE_16_32_8 3
- mova xm2, [r1+%1 ]
+%macro ADD_RES_SSE_16_32_8 3
+ mova xm2, [r1+%1]
mova xm6, [r1+%1+16]
%if cpuflag(avx2)
vinserti128 m2, m2, [r1+%1+32], 1
@@ -107,7 +107,7 @@ cglobal hevc_add_residual4_8, 3, 4, 6
packuswb m2, m6
packuswb m1, m5
- mova xm4, [r1+%1+mmsize*2 ]
+ mova xm4, [r1+%1+mmsize*2]
mova xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
vinserti128 m4, m4, [r1+%1+96 ], 1
@@ -135,39 +135,39 @@ cglobal hevc_add_residual4_8, 3, 4, 6
%macro TRANSFORM_ADD_8 0
-; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual8_8, 3, 4, 8
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 4, 8
lea r3, [r2*3]
- TR_ADD_SSE_8_8
+ ADD_RES_SSE_8_8
add r1, 64
lea r0, [r0+r2*4]
- TR_ADD_SSE_8_8
+ ADD_RES_SSE_8_8
RET
-; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual16_8, 3, 4, 7
- pxor m0, m0
- lea r3, [r2*3]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
+ ADD_RES_SSE_16_32_8 0, r0, r0+r2
+ ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
%rep 3
- add r1, 128
- lea r0, [r0+r2*4]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+ add r1, 128
+ lea r0, [r0+r2*4]
+ ADD_RES_SSE_16_32_8 0, r0, r0+r2
+ ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
%endrep
RET
-; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
- pxor m0, m0
- TR_ADD_SSE_16_32_8 0, r0, r0+16
- TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
+ pxor m0, m0
+ ADD_RES_SSE_16_32_8 0, r0, r0+16
+ ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
%rep 15
- add r1, 128
- lea r0, [r0+r2*2]
- TR_ADD_SSE_16_32_8 0, r0, r0+16
- TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+ add r1, 128
+ lea r0, [r0+r2*2]
+ ADD_RES_SSE_16_32_8 0, r0, r0+16
+ ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
%endrep
RET
%endmacro
@@ -179,80 +179,77 @@ TRANSFORM_ADD_8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
- pxor m0, m0
- lea r3, [r2*3]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
+ ADD_RES_SSE_16_32_8 0, r0, r0+r2
+ ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
%rep 7
- add r1, 256
- lea r0, [r0+r2*4]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+ add r1, 256
+ lea r0, [r0+r2*4]
+ ADD_RES_SSE_16_32_8 0, r0, r0+r2
+ ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
%endrep
RET
%endif
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
-%macro TR_ADD_SSE_8_10 4
+%macro ADD_RES_SSE_8_10 4
mova m0, [%4]
mova m1, [%4+16]
mova m2, [%4+32]
mova m3, [%4+48]
- paddw m0, [%1+0 ]
- paddw m1, [%1+%2 ]
+ paddw m0, [%1+0]
+ paddw m1, [%1+%2]
paddw m2, [%1+%2*2]
- paddw m3, [%1+%3 ]
+ paddw m3, [%1+%3]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
- mova [%1+0 ], m0
- mova [%1+%2 ], m1
+ mova [%1+0], m0
+ mova [%1+%2], m1
mova [%1+%2*2], m2
- mova [%1+%3 ], m3
+ mova [%1+%3], m3
%endmacro
-%macro TR_ADD_MMX4_10 3
- mova m0, [%1+0 ]
- mova m1, [%1+%2 ]
+%macro ADD_RES_MMX_4_10 3
+ mova m0, [%1+0]
+ mova m1, [%1+%2]
paddw m0, [%3]
paddw m1, [%3+8]
CLIPW m0, m2, m3
CLIPW m1, m2, m3
- mova [%1+0 ], m0
- mova [%1+%2 ], m1
+ mova [%1+0], m0
+ mova [%1+%2], m1
%endmacro
-%macro TRANS_ADD_SSE_16_10 3
+%macro ADD_RES_SSE_16_10 3
mova m0, [%3]
mova m1, [%3+16]
mova m2, [%3+32]
mova m3, [%3+48]
- paddw m0, [%1 ]
- paddw m1, [%1+16 ]
- paddw m2, [%1+%2 ]
+ paddw m0, [%1]
+ paddw m1, [%1+16]
+ paddw m2, [%1+%2]
paddw m3, [%1+%2+16]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
- mova [%1 ], m0
- mova [%1+16 ], m1
- mova [%1+%2 ], m2
+ mova [%1], m0
+ mova [%1+16], m1
+ mova [%1+%2], m2
mova [%1+%2+16], m3
%endmacro
-%macro TRANS_ADD_SSE_32_10 2
+%macro ADD_RES_SSE_32_10 2
mova m0, [%2]
mova m1, [%2+16]
mova m2, [%2+32]
mova m3, [%2+48]
- paddw m0, [%1 ]
+ paddw m0, [%1]
paddw m1, [%1+16]
paddw m2, [%1+32]
paddw m3, [%1+48]
@@ -260,129 +257,125 @@ cglobal hevc_add_residual32_8, 3, 4, 7
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
- mova [%1 ], m0
+ mova [%1], m0
mova [%1+16], m1
mova [%1+32], m2
mova [%1+48], m3
%endmacro
-%macro TRANS_ADD16_AVX2 4
+%macro ADD_RES_AVX2_16_10 4
mova m0, [%4]
mova m1, [%4+32]
mova m2, [%4+64]
mova m3, [%4+96]
- paddw m0, [%1+0 ]
- paddw m1, [%1+%2 ]
+ paddw m0, [%1+0]
+ paddw m1, [%1+%2]
paddw m2, [%1+%2*2]
- paddw m3, [%1+%3 ]
+ paddw m3, [%1+%3]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
- mova [%1+0 ], m0
- mova [%1+%2 ], m1
+ mova [%1+0], m0
+ mova [%1+%2], m1
mova [%1+%2*2], m2
- mova [%1+%3 ], m3
+ mova [%1+%3], m3
%endmacro
-%macro TRANS_ADD32_AVX2 3
+%macro ADD_RES_AVX2_32_10 3
mova m0, [%3]
mova m1, [%3+32]
mova m2, [%3+64]
mova m3, [%3+96]
- paddw m0, [%1 ]
- paddw m1, [%1+32 ]
- paddw m2, [%1+%2 ]
+ paddw m0, [%1]
+ paddw m1, [%1+32]
+ paddw m2, [%1+%2]
paddw m3, [%1+%2+32]
CLIPW m0, m4, m5
CLIPW m1, m4, m5
CLIPW m2, m4, m5
CLIPW m3, m4, m5
- mova [%1 ], m0
- mova [%1+32 ], m1
- mova [%1+%2 ], m2
+ mova [%1], m0
+ mova [%1+32], m1
+ mova [%1+%2], m2
mova [%1+%2+32], m3
%endmacro
-
+; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
-cglobal hevc_add_residual4_10,3,4, 6
+cglobal hevc_add_residual_4_10, 3, 4, 6
pxor m2, m2
mova m3, [max_pixels_10]
- TR_ADD_MMX4_10 r0, r2, r1
+ ADD_RES_MMX_4_10 r0, r2, r1
add r1, 16
lea r0, [r0+2*r2]
- TR_ADD_MMX4_10 r0, r2, r1
+ ADD_RES_MMX_4_10 r0, r2, r1
RET
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal hevc_add_residual8_10,3,4,6
+cglobal hevc_add_residual_8_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
lea r3, [r2*3]
- TR_ADD_SSE_8_10 r0, r2, r3, r1
+ ADD_RES_SSE_8_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
add r1, 64
- TR_ADD_SSE_8_10 r0, r2, r3, r1
+ ADD_RES_SSE_8_10 r0, r2, r3, r1
RET
-cglobal hevc_add_residual16_10,3,4,6
+cglobal hevc_add_residual_16_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
- TRANS_ADD_SSE_16_10 r0, r2, r1
+ ADD_RES_SSE_16_10 r0, r2, r1
%rep 7
- lea r0, [r0+r2*2]
- add r1, 64
- TRANS_ADD_SSE_16_10 r0, r2, r1
+ lea r0, [r0+r2*2]
+ add r1, 64
+ ADD_RES_SSE_16_10 r0, r2, r1
%endrep
RET
-cglobal hevc_add_residual32_10,3,4,6
+cglobal hevc_add_residual_32_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
- TRANS_ADD_SSE_32_10 r0, r1
+ ADD_RES_SSE_32_10 r0, r1
%rep 31
- lea r0, [r0+r2]
- add r1, 64
- TRANS_ADD_SSE_32_10 r0, r1
+ lea r0, [r0+r2]
+ add r1, 64
+ ADD_RES_SSE_32_10 r0, r1
%endrep
RET
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
+cglobal hevc_add_residual_16_10, 3, 4, 6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+ lea r3, [r2*3]
-cglobal hevc_add_residual16_10,3,4,6
- pxor m4, m4
- mova m5, [max_pixels_10]
- lea r3, [r2*3]
-
- TRANS_ADD16_AVX2 r0, r2, r3, r1
+ ADD_RES_AVX2_16_10 r0, r2, r3, r1
%rep 3
- lea r0, [r0+r2*4]
- add r1, 128
- TRANS_ADD16_AVX2 r0, r2, r3, r1
+ lea r0, [r0+r2*4]
+ add r1, 128
+ ADD_RES_AVX2_16_10 r0, r2, r3, r1
%endrep
RET
-cglobal hevc_add_residual32_10,3,4,6
- pxor m4, m4
- mova m5, [max_pixels_10]
+cglobal hevc_add_residual_32_10, 3, 4, 6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
- TRANS_ADD32_AVX2 r0, r2, r1
+ ADD_RES_AVX2_32_10 r0, r2, r1
%rep 15
- lea r0, [r0+r2*2]
- add r1, 128
- TRANS_ADD32_AVX2 r0, r2, r1
+ lea r0, [r0+r2*2]
+ add r1, 128
+ ADD_RES_AVX2_32_10 r0, r2, r1
%endrep
RET
-%endif ;HAVE_AVX_EXTERNAL
+%endif ;HAVE_AVX2_EXTERNAL
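
Note that the 10-bit paths above clip with CLIPW between zero and max_pixels_10
(pw_1023) rather than relying on byte saturation. A hypothetical scalar
equivalent, assuming a byte stride and the same contiguous residual layout as
the 8-bit sketch earlier:

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical scalar equivalent of the 10-bit functions: pixels are
     * uint16_t and each sum is clamped to [0, 1023], matching the CLIPW
     * bounds m4 (zero) and m5 (max_pixels_10) used in the asm. */
    static void add_residual_10_c(uint8_t *_dst, const int16_t *res,
                                  ptrdiff_t stride, int size)
    {
        uint16_t *dst = (uint16_t *)_dst;

        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : v > 1023 ? 1023 : v;
            }
            dst += stride / 2; /* stride is in bytes */
            res += size;
        }
    }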
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 63a148e..67be0a9 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -236,23 +236,24 @@ WEIGHTING_PROTOTYPES(12, sse4);
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////
-void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 0b17671..17cd233 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -713,7 +713,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
- c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
+
+ c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@@ -734,9 +735,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_sse2;
c->idct[1] = ff_hevc_idct_8x8_8_sse2;
- c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
- c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
- c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
+ c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+ c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
if(ARCH_X86_64) {
@@ -772,9 +773,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx;
- c->add_residual[1] = ff_hevc_add_residual8_8_avx;
- c->add_residual[2] = ff_hevc_add_residual16_8_avx;
- c->add_residual[3] = ff_hevc_add_residual32_8_avx;
+ c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+ c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
@@ -874,11 +875,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
- c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
+ c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
}
@@ -902,9 +903,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_10_sse2;
c->idct[1] = ff_hevc_idct_8x8_10_sse2;
- c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
- c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
- c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
+ c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+ c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+ c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -1090,9 +1091,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
SAO_BAND_INIT(10, avx2);
SAO_EDGE_INIT(10, avx2);
- c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
- c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
-
+ c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+ c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
}
} else if (bit_depth == 12) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
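
The add_residual[4] table populated above is indexed by transform size, with
entries 0..3 covering 4x4 through 32x32 blocks. A minimal sketch of a call site
(hypothetical; the decoder proper is outside this diff), assuming the index is
the log2 of the block size minus 2:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct HEVCDSPContext {
        /* only the member relevant here, as declared in libavcodec/hevcdsp.h */
        void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
    } HEVCDSPContext;

    /* Hypothetical call site: pick the size-specific implementation that
     * ff_hevc_dsp_init_x86() installed for the current CPU. */
    static void reconstruct_block(const HEVCDSPContext *dsp, uint8_t *dst,
                                  int16_t *coeffs, ptrdiff_t stride,
                                  int log2_trafo_size)
    {
        dsp->add_residual[log2_trafo_size - 2](dst, coeffs, stride);
    }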
======================================================================
diff --cc doc/libav-merge.txt
index d57b79a,0000000..44547c9
mode 100644,000000..100644
--- a/doc/libav-merge.txt
+++ b/doc/libav-merge.txt
@@@ -1,114 -1,0 +1,115 @@@
+CONTEXT
+=======
+
+The FFmpeg project merges all the changes from the Libav project
+(https://libav.org) since the origin of the fork (around 2011).
+
+With the exception of some commits due to technical/political disagreements or
+issues, the changes are merged on a more or less regular schedule (daily for
+years thanks to Michael, but more sparsely nowadays).
+
+WHY
+===
+
+The majority of the active developers believe the project needs to keep this
+policy for various reasons.
+
+The most important one is that we don't want our users to have to choose
+between two distributors of libraries of the exact same name in order to have a
+different set of features and bugfixes. By taking the responsibility of
+unifying the two codebases, we allow users to benefit from the changes from the
+two teams.
+
+Today, FFmpeg has a much larger user base (we are distributed by every
+major distribution), so we consider this mission a priority.
+
+A different approach to the merge could have been to pick the changes we are
+interested in and drop most of the cosmetics and other less important changes.
+Unfortunately, this makes the following picks much harder, especially since the
+Libav project is involved in various deep API changes. As a result, we decided
+to take virtually everything done there.
+
+Any Libav developer is of course welcome anytime to contribute directly to the
+FFmpeg tree. We fully understand, and are forced to accept, that very few
+Libav developers are interested in doing so, but we still want to recognize
+their work. This leads us to create merge commits for every single commit from
+Libav. The original commit appears totally unchanged with full authorship in
+our history (and the conflicts are resolved in the merge commit). That way, not
+a single thing from Libav will be lost in the future in case some reunification
+happens, or if that project disappears one way or another.
+
+DOWNSIDES
+=========
+
+Of course, there are many downsides to this approach.
+
+- It causes non-negligible merge-commit pollution. We make sure there are
+  not several levels of merges entangled (we do a 1:1 merge/commit), but it's
+ still a non-linear history.
+
+- Much duplicated work. For instance, we added libavresample in our tree to
+  keep compatibility with Libav when our libswresample was already covering the
+  exact same purpose. The same thing happened for various elements such as the
+  ProRes support (but with differences in features, bugs, licenses, ...). There
+  is much work to do to unify them, and any help is very much welcome.
+
+- So much manpower from both FFmpeg and Libav is lost because of this mess. We
+ know it, and we don't know how to fix it. It takes incredible time to do
+ these merges, so we have even less time to work on things we personally care
+ about. The bad vibes also do not help with keeping our developers motivated.
+
+- There is a growing technical risk factor with the merges due to the two
+  codebases diverging more and more.
+
+MERGE GUIDELINES
+================
+
+The following gives developer guidelines on how to proceed when merging Libav commits.
+
+Before starting, you can reduce the risk of errors on merge conflicts by using
+a different merge conflict style:
+
+ $ git config --global merge.conflictstyle diff3
+
+tools/libav-merge-next-commit is a script to help merge the next commit in
+the queue. It assumes a remote named libav. It has two modes: merge and noop.
+The noop mode creates a merge with no change to the HEAD. You can pass a hash
+as an extra argument to reference a justification (it is common that we already
+have the change done in FFmpeg).
+
+Also see tools/murge: you can copy and paste a 3-way conflict into its stdin
+and it will display colored diffs. Any arguments to murge (such as ones to suppress
+whitespace differences) are passed into colordiff.
+
+TODO/FIXME/UNMERGED
+===================
+
+Stuff that didn't reach the codebase:
+-------------------------------------
+
+- HEVC DSP and x86 MC SIMD improvements from Libav (see https://ffmpeg.org/pipermail/ffmpeg-devel/2015-December/184777.html)
+ - 1f821750f hevcdsp: split the qpel functions by width instead of by the subpixel fraction
+ - 818bfe7f0 hevcdsp: split the epel functions by width
+ - 688417399 hevcdsp: split the pred functions by width
+ - a853388d2 hevc: change the stride of the MC buffer to be in bytes instead of elements
+ - 0cef06df0 checkasm: add HEVC MC tests
+ - e7078e842 hevcdsp: add x86 SIMD for MC
+- VAAPI VP8 decode hwaccel (currently under review: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/thread.html#207348)
+- Removal of the custom atomic API (5cc0057f49, see http://ffmpeg.org/pipermail/ffmpeg-devel/2017-March/209003.html)
+- Use the new bitstream filter for extracting extradata (see 8e2ea69135 and 096a8effa3)
++- ADD_RES_MMX_4_8 in libavcodec/x86/hevc_add_res.asm probably needs updating (see 589880710)
+
+Collateral damage that needs work locally:
+------------------------------------------
+
+- Merge proresdec2.c and proresdec_lgpl.c
+- Merge proresenc_anatoliy.c and proresenc_kostya.c
+- Remove ADVANCED_PARSER in libavcodec/hevc_parser.c
+- Fix MIPS AC3 downmix
+
+Extra changes needed to be aligned with Libav:
+----------------------------------------------
+
+- Switching our examples to the new encode/decode API (see 67d28f4a0f)
+- AC3 speed-up for our fixed version (see a9ba59591e)
+- HEVC IDCT bit depth 12-bit support (Libav added 8 and 10 but doesn't have 12)
diff --cc libavcodec/hevcdsp.h
index 3b7e737,49cb711..eefb3cd
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@@ -43,82 -39,77 +43,82 @@@ typedef struct SAOParams
} SAOParams;
typedef struct HEVCDSPContext {
- void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
- GetBitContext *gb, int pcm_bit_depth);
+ void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+ struct GetBitContext *gb, int pcm_bit_depth);
- void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+ void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
- void (*dequant)(int16_t *coeffs);
+ void (*dequant)(int16_t *coeffs, int16_t log2_size);
+
+ void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
+
void (*transform_4x4_luma)(int16_t *coeffs);
+
void (*idct[4])(int16_t *coeffs, int col_limit);
+
void (*idct_dc[4])(int16_t *coeffs);
- void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
- struct SAOParams *sao, int *borders,
- int width, int height, int c_idx);
- void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
- struct SAOParams *sao, int *borders, int width,
- int height, int c_idx, uint8_t vert_edge,
- uint8_t horiz_edge, uint8_t diag_edge);
-
- void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
- ptrdiff_t srcstride, int height,
- int mx, int my, int16_t *mcbuffer);
- void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
- ptrdiff_t srcstride, int height,
- int mx, int my, int16_t *mcbuffer);
-
- void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride,
- int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
- void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride,
- int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
- uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
- uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
- int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
- ptrdiff_t dststride, int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
- int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
- ptrdiff_t dststride, int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
+ void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+ /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+ void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+
+ void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+
+ void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+
+ void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+
+ void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int ox0, int wx1,
+ int ox1, intptr_t mx, intptr_t my, int width);
void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p, uint8_t *no_q);
+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p, uint8_t *no_q);
+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p,
+ int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p,
+ int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
} HEVCDSPContext;
diff --cc libavcodec/x86/hevc_add_res.asm
index 869288f,66b929c..1ea15df
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@@ -15,51 -15,49 +15,51 @@@
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
-; * License along with Libav; if not, write to the Free Software
+; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ; */
+ ; ******************************************************************************
+
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA 32
-max_pixels_10: times 16 dw ((1 << 10)-1)
-
SECTION .text
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
-
- ;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
- %macro TR_ADD_MMX_4_8 0
+ ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+ %macro ADD_RES_MMX_4_8 0
- mova m0, [r1]
- mova m2, [r1+8]
- pxor m1, m1
+ mova m2, [r1]
+ mova m4, [r1+8]
pxor m3, m3
- psubw m1, m0
psubw m3, m2
- packuswb m0, m2
- packuswb m1, m3
-
- movd m2, [r0]
- movd m3, [r0+r2]
- punpckldq m2, m3
+ packuswb m2, m2
+ packuswb m3, m3
+ pxor m5, m5
+ psubw m5, m4
+ packuswb m4, m4
+ packuswb m5, m5
+
- movh m0, [r0 ]
- movh m1, [r0+r2 ]
++ movh m0, [r0]
++ movh m1, [r0+r2]
paddusb m0, m2
- psubusb m0, m1
- movd [r0], m0
- psrlq m0, 32
- movd [r0+r2], m0
+ paddusb m1, m4
+ psubusb m0, m3
+ psubusb m1, m5
- movh [r0 ], m0
- movh [r0+r2 ], m1
++ movh [r0], m0
++ movh [r0+r2], m1
%endmacro
INIT_MMX mmxext
- ; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual4_8, 3, 4, 6
- TR_ADD_MMX_4_8
+ ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_4_8, 3, 3, 6
++cglobal hevc_add_residual_4_8, 3, 4, 6
+ ADD_RES_MMX_4_8
add r1, 16
lea r0, [r0+r2*2]
- TR_ADD_MMX_4_8
+ ADD_RES_MMX_4_8
RET
- %macro TR_ADD_SSE_8_8 0
+ %macro ADD_RES_SSE_8_8 0
pxor m3, m3
mova m4, [r1]
mova m6, [r1+16]
@@@ -135,40 -119,40 +135,40 @@@
%macro TRANSFORM_ADD_8 0
- ; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual8_8, 3, 4, 8
+ ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+ cglobal hevc_add_residual_8_8, 3, 4, 8
lea r3, [r2*3]
- TR_ADD_SSE_8_8
+ ADD_RES_SSE_8_8
add r1, 64
lea r0, [r0+r2*4]
- TR_ADD_SSE_8_8
+ ADD_RES_SSE_8_8
RET
- ; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual16_8, 3, 4, 7
- pxor m0, m0
- lea r3, [r2*3]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+ ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_16_8, 3, 5, 7
++cglobal hevc_add_residual_16_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
- mov r4d, 4
-.loop:
+ ADD_RES_SSE_16_32_8 0, r0, r0+r2
+ ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
+%rep 3
- add r1, 128
- lea r0, [r0+r2*4]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+ add r1, 128
+ lea r0, [r0+r2*4]
- dec r4d
- jg .loop
++ ADD_RES_SSE_16_32_8 0, r0, r0+r2
++ ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
+%endrep
RET
- ; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual32_8, 3, 4, 7
- pxor m0, m0
- TR_ADD_SSE_16_32_8 0, r0, r0+16
- TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+ ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_32_8, 3, 5, 7
++cglobal hevc_add_residual_32_8, 3, 4, 7
+ pxor m0, m0
- mov r4d, 16
-.loop:
+ ADD_RES_SSE_16_32_8 0, r0, r0+16
+ ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
+%rep 15
- add r1, 128
- lea r0, [r0+r2*2]
- TR_ADD_SSE_16_32_8 0, r0, r0+16
- TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+ add r1, 128
+ lea r0, [r0+r2*2]
- dec r4d
- jg .loop
++ ADD_RES_SSE_16_32_8 0, r0, r0+16
++ ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
+%endrep
RET
%endmacro
@@@ -179,25 -163,22 +179,22 @@@ TRANSFORM_ADD_
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
- ; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual32_8, 3, 4, 7
- pxor m0, m0
- lea r3, [r2*3]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+ ; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_32_8, 3, 5, 7
++cglobal hevc_add_residual_32_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
- mov r4d, 8
-.loop:
+ ADD_RES_SSE_16_32_8 0, r0, r0+r2
+ ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
- add r1, 256
- lea r0, [r0+r2*4]
- TR_ADD_SSE_16_32_8 0, r0, r0+r2
- TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+ add r1, 256
+ lea r0, [r0+r2*4]
- dec r4d
- jg .loop
++ ADD_RES_SSE_16_32_8 0, r0, r0+r2
++ ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
RET
-%endif ;HAVE_AVX2_EXTERNAL
+%endif
- ;-----------------------------------------------------------------------------
- ; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
- ;-----------------------------------------------------------------------------
- %macro TR_ADD_SSE_8_10 4
+ %macro ADD_RES_SSE_8_10 4
mova m0, [%4]
mova m1, [%4+16]
mova m2, [%4+32]
@@@ -308,81 -289,81 +305,77 @@@
mova [%1+%2+32], m3
%endmacro
-
+ ; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
- cglobal hevc_add_residual4_10,3,4, 6
-cglobal hevc_add_residual_4_10, 3, 3, 6
++cglobal hevc_add_residual_4_10, 3, 4, 6
pxor m2, m2
mova m3, [max_pixels_10]
- TR_ADD_MMX4_10 r0, r2, r1
+ ADD_RES_MMX_4_10 r0, r2, r1
add r1, 16
lea r0, [r0+2*r2]
- TR_ADD_MMX4_10 r0, r2, r1
+ ADD_RES_MMX_4_10 r0, r2, r1
RET
- ;-----------------------------------------------------------------------------
- ; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
- ;-----------------------------------------------------------------------------
INIT_XMM sse2
- cglobal hevc_add_residual8_10,3,4,6
+ cglobal hevc_add_residual_8_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
lea r3, [r2*3]
- TR_ADD_SSE_8_10 r0, r2, r3, r1
+ ADD_RES_SSE_8_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
add r1, 64
- TR_ADD_SSE_8_10 r0, r2, r3, r1
+ ADD_RES_SSE_8_10 r0, r2, r3, r1
RET
- cglobal hevc_add_residual16_10,3,4,6
-cglobal hevc_add_residual_16_10, 3, 5, 6
++cglobal hevc_add_residual_16_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
- TRANS_ADD_SSE_16_10 r0, r2, r1
- mov r4d, 8
-.loop:
+ ADD_RES_SSE_16_10 r0, r2, r1
+%rep 7
- lea r0, [r0+r2*2]
- add r1, 64
- TRANS_ADD_SSE_16_10 r0, r2, r1
+ lea r0, [r0+r2*2]
+ add r1, 64
- dec r4d
- jg .loop
++ ADD_RES_SSE_16_10 r0, r2, r1
+%endrep
RET
- cglobal hevc_add_residual32_10,3,4,6
-cglobal hevc_add_residual_32_10, 3, 5, 6
++cglobal hevc_add_residual_32_10, 3, 4, 6
pxor m4, m4
mova m5, [max_pixels_10]
- TRANS_ADD_SSE_32_10 r0, r1
- mov r4d, 32
-.loop
+ ADD_RES_SSE_32_10 r0, r1
+%rep 31
- lea r0, [r0+r2]
- add r1, 64
- TRANS_ADD_SSE_32_10 r0, r1
+ lea r0, [r0+r2]
+ add r1, 64
- dec r4d
- jg .loop
++ ADD_RES_SSE_32_10 r0, r1
+%endrep
RET
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-cglobal hevc_add_residual_16_10, 3, 5, 6
++cglobal hevc_add_residual_16_10, 3, 4, 6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+ lea r3, [r2*3]
- cglobal hevc_add_residual16_10,3,4,6
- pxor m4, m4
- mova m5, [max_pixels_10]
- lea r3, [r2*3]
-
- TRANS_ADD16_AVX2 r0, r2, r3, r1
- mov r4d, 4
-.loop
+ ADD_RES_AVX2_16_10 r0, r2, r3, r1
+%rep 3
- lea r0, [r0+r2*4]
- add r1, 128
- TRANS_ADD16_AVX2 r0, r2, r3, r1
+ lea r0, [r0+r2*4]
+ add r1, 128
- dec r4d
- jg .loop
++ ADD_RES_AVX2_16_10 r0, r2, r3, r1
+%endrep
RET
- cglobal hevc_add_residual32_10,3,4,6
- pxor m4, m4
- mova m5, [max_pixels_10]
-cglobal hevc_add_residual_32_10, 3, 5, 6
++cglobal hevc_add_residual_32_10, 3, 4, 6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
- TRANS_ADD32_AVX2 r0, r2, r1
- mov r4d, 16
-.loop
+ ADD_RES_AVX2_32_10 r0, r2, r1
+%rep 15
- lea r0, [r0+r2*2]
- add r1, 128
- TRANS_ADD32_AVX2 r0, r2, r1
+ lea r0, [r0+r2*2]
+ add r1, 128
- dec r4d
- jg .loop
++ ADD_RES_AVX2_32_10 r0, r2, r1
+%endrep
RET
- %endif ;HAVE_AVX_EXTERNAL
+ %endif ;HAVE_AVX2_EXTERNAL
diff --cc libavcodec/x86/hevcdsp.h
index 63a148e,0000000..67be0a9
mode 100644,000000..100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@@ -1,258 -1,0 +1,259 @@@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##6, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+ WEIGHTING_PROTOTYPE(2, bitd, opt); \
+ WEIGHTING_PROTOTYPE(4, bitd, opt); \
+ WEIGHTING_PROTOTYPE(6, bitd, opt); \
+ WEIGHTING_PROTOTYPE(8, bitd, opt); \
+ WEIGHTING_PROTOTYPE(12, bitd, opt); \
+ WEIGHTING_PROTOTYPE(16, bitd, opt); \
+ WEIGHTING_PROTOTYPE(24, bitd, opt); \
+ WEIGHTING_PROTOTYPE(32, bitd, opt); \
+ WEIGHTING_PROTOTYPE(48, bitd, opt); \
+ WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels , 8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h , 8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v , 8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv , 8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h , 8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v, 8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv, 8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
- void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
- void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
- void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
- void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
- void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++
++void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --cc libavcodec/x86/hevcdsp_init.c
index 0b17671,a95fa30..17cd233
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@@ -734,153 -315,37 +735,153 @@@ void ff_hevc_dsp_init_x86(HEVCDSPContex
c->idct[0] = ff_hevc_idct_4x4_8_sse2;
c->idct[1] = ff_hevc_idct_8x8_8_sse2;
- c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
- c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
- c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
- SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
-
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
++ c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
++ c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
++ c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
- SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+ if(ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+ }
+ SAO_EDGE_INIT(8, ssse3);
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+
+ c->idct[2] = ff_hevc_idct_16x16_8_avx;
+ c->idct[3] = ff_hevc_idct_32x32_8_avx;
+ }
+ SAO_BAND_INIT(8, avx);
+
c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx;
+
- c->add_residual[1] = ff_hevc_add_residual8_8_avx;
- c->add_residual[2] = ff_hevc_add_residual16_8_avx;
- c->add_residual[3] = ff_hevc_add_residual32_8_avx;
+ c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+ c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
+ c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+ }
+ SAO_BAND_INIT(8, avx2);
+
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
- c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
+ c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
++ c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
-
- c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
@@@ -901,251 -357,89 +902,250 @@@
c->idct[0] = ff_hevc_idct_4x4_10_sse2;
c->idct[1] = ff_hevc_idct_8x8_10_sse2;
- SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
-
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
- c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
- c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
- c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
+ c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+ c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+ c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
}
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
+ }
if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+
+ c->idct[2] = ff_hevc_idct_16x16_10_avx;
+ c->idct[3] = ff_hevc_idct_32x32_10_avx;
+ }
+
c->idct[0] = ff_hevc_idct_4x4_10_avx;
c->idct[1] = ff_hevc_idct_8x8_10_avx;
+
+ SAO_BAND_INIT(10, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {
- c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
- c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
}
- }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
-#if ARCH_X86_64
- if (bit_depth == 8) {
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct[2] = ff_hevc_idct_16x16_8_sse2;
- c->idct[3] = ff_hevc_idct_32x32_8_sse2;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
- }
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
- }
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
- if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
- c->idct[2] = ff_hevc_idct_16x16_8_avx;
- c->idct[3] = ff_hevc_idct_32x32_8_avx;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+ c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+ c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+ }
+ SAO_BAND_INIT(10, avx2);
+ SAO_EDGE_INIT(10, avx2);
+
- c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
- c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
-
++ c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
++ c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
}
- if (EXTERNAL_AVX2(cpu_flags)) {
- c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
- c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+ } else if (bit_depth == 12) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
}
- } else if (bit_depth == 10) {
if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct[2] = ff_hevc_idct_16x16_10_sse2;
- c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+ }
+ SAO_BAND_INIT(12, sse2);
+ SAO_EDGE_INIT(12, sse2);
+
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
}
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
}
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
- SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
- SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
- c->idct[2] = ff_hevc_idct_16x16_10_avx;
- c->idct[3] = ff_hevc_idct_32x32_10_avx;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+ }
+ SAO_BAND_INIT(12, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {
- c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
- c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
+
+ SAO_BAND_INIT(12, avx2);
+ SAO_EDGE_INIT(12, avx2);
}
}
-#endif /* ARCH_X86_64 */
}
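
[Editor's note: ff_hevc_dsp_init_x86() above follows libavutil's standard runtime-dispatch pattern: query the CPU once, then keep overwriting each function pointer with the fastest variant the host supports, so later checks shadow earlier ones. Below is a condensed sketch of that pattern for the 8-bit add_residual[3] slot; av_get_cpu_flags() and the EXTERNAL_*() probes are the real libavutil facilities used in the diff, while init_add_residual_sketch is a hypothetical wrapper name and the C fallback is assumed to have been installed by the generic init code beforehand.]

    #include "libavutil/cpu.h"
    #include "libavutil/x86/cpu.h"

    /* Condensed sketch of the dispatch pattern in ff_hevc_dsp_init_x86().
     * Each successful probe overwrites the previous assignment, so the
     * table ends up pointing at the best implementation available. */
    static void init_add_residual_sketch(HEVCDSPContext *c, int bit_depth)
    {
        int cpu_flags = av_get_cpu_flags();

        if (bit_depth == 8) {
            if (EXTERNAL_SSE2(cpu_flags))
                c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
            if (EXTERNAL_AVX(cpu_flags))
                c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
            if (EXTERNAL_AVX2_FAST(cpu_flags))
                c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
        }
    }
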