[FFmpeg-cvslog] Merge commit '6d5636ad9ab6bd9bedf902051d88b7044385f88b'

Clément Bœsch git at videolan.org
Fri Mar 24 13:34:19 EET 2017


ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Fri Mar 24 12:29:21 2017 +0100| [3d6535983282bea542dac2e568ae50da5796be34] | committer: Clément Bœsch

Merge commit '6d5636ad9ab6bd9bedf902051d88b7044385f88b'

* commit '6d5636ad9ab6bd9bedf902051d88b7044385f88b':
  hevc: x86: Add add_residual() SIMD optimizations

See a6af4bf64dae46356a5f91537a1c8c5f86456b37

This merge is cosmetic only (renames, space shuffling, etc.).

The functional changes in the ASM are *not* merged:
- unrolling with %rep is kept (sketched below)
- ADD_RES_MMX_4_8 is left untouched: this needs investigation
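
For context, here is roughly what the two unrolling styles look like for the
16x16 8-bit function (paraphrased for illustration; the exact code is in the
diffs below). FFmpeg keeps compile-time unrolling with %rep, which duplicates
the instruction block at assembly time; the unmerged Libav change drives the
same body with a run-time counted loop, which yields smaller code but needs an
extra GPR for the counter:

    ; kept (FFmpeg): %rep emits the block three more times at assembly time
        ADD_RES_SSE_16_32_8  0, r0,      r0+r2
        ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    %rep 3
        add                 r1, 128
        lea                 r0, [r0+r2*4]
        ADD_RES_SSE_16_32_8  0, r0,      r0+r2
        ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    %endrep

    ; not merged (Libav): a counted loop over the same body
        mov                r4d, 4
    .loop:
        ADD_RES_SSE_16_32_8  0, r0,      r0+r2
        ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
        add                 r1, 128
        lea                 r0, [r0+r2*4]
        dec                r4d
        jg .loop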

Merged-by: Clément Bœsch <u at pkh.me>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3d6535983282bea542dac2e568ae50da5796be34
---

 doc/libav-merge.txt             |   1 +
 libavcodec/hevcdsp.h            |   2 +-
 libavcodec/x86/hevc_add_res.asm | 269 +++++++++++++++++++---------------------
 libavcodec/x86/hevcdsp.h        |  29 ++---
 libavcodec/x86/hevcdsp_init.c   |  30 ++---
 5 files changed, 163 insertions(+), 168 deletions(-)

diff --git a/doc/libav-merge.txt b/doc/libav-merge.txt
index d57b79a..44547c9 100644
--- a/doc/libav-merge.txt
+++ b/doc/libav-merge.txt
@@ -97,6 +97,7 @@ Stuff that didn't reach the codebase:
 - VAAPI VP8 decode hwaccel (currently under review: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/thread.html#207348)
 - Removal of the custom atomic API (5cc0057f49, see http://ffmpeg.org/pipermail/ffmpeg-devel/2017-March/209003.html)
 - Use the new bitstream filter for extracting extradata (see 8e2ea69135 and 096a8effa3)
+- ADD_RES_MMX_4_8 in libavcodec/x86/hevc_add_res.asm probably needs updating (see 589880710)
 
 Collateral damage that needs work locally:
 ------------------------------------------
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 3b7e737..eefb3cd 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -46,7 +46,7 @@ typedef struct HEVCDSPContext {
     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
                     struct GetBitContext *gb, int pcm_bit_depth);
 
-    void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
     void (*dequant)(int16_t *coeffs, int16_t log2_size);
 
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
index 869288f..1ea15df 100644
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -1,4 +1,4 @@
-; /*
+; *****************************************************************************
 ; * Provide SIMD optimizations for add_residual functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
@@ -17,7 +17,8 @@
 ; * You should have received a copy of the GNU Lesser General Public
 ; * License along with FFmpeg; if not, write to the Free Software
 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-; */
+; ******************************************************************************
+
 %include "libavutil/x86/x86util.asm"
 
 SECTION .text
@@ -25,9 +26,8 @@ SECTION .text
 cextern pw_1023
 %define max_pixels_10 pw_1023
 
-
-;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
-%macro TR_ADD_MMX_4_8 0
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
     mova              m2, [r1]
     mova              m4, [r1+8]
     pxor              m3, m3
@@ -39,27 +39,27 @@ cextern pw_1023
     packuswb          m4, m4
     packuswb          m5, m5
 
-    movh              m0, [r0     ]
-    movh              m1, [r0+r2  ]
+    movh              m0, [r0]
+    movh              m1, [r0+r2]
     paddusb           m0, m2
     paddusb           m1, m4
     psubusb           m0, m3
     psubusb           m1, m5
-    movh       [r0     ], m0
-    movh       [r0+r2  ], m1
+    movh            [r0], m0
+    movh         [r0+r2], m1
 %endmacro
 
 
 INIT_MMX mmxext
-; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual4_8, 3, 4, 6
-    TR_ADD_MMX_4_8
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+    ADD_RES_MMX_4_8
     add               r1, 16
     lea               r0, [r0+r2*2]
-    TR_ADD_MMX_4_8
+    ADD_RES_MMX_4_8
     RET
 
-%macro TR_ADD_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 0
     pxor              m3, m3
     mova              m4, [r1]
     mova              m6, [r1+16]
@@ -74,22 +74,22 @@ cglobal hevc_add_residual4_8, 3, 4, 6
     packuswb          m6, m2
     packuswb          m7, m3
 
-    movq                m0, [r0     ]
-    movq                m1, [r0+r2  ]
-    movhps              m0, [r0+r2*2]
-    movhps              m1, [r0+r3  ]
-    paddusb             m0, m4
-    paddusb             m1, m6
-    psubusb             m0, m5
-    psubusb             m1, m7
-    movq         [r0     ], m0
-    movq         [r0+r2  ], m1
-    movhps       [r0+2*r2], m0
-    movhps       [r0+r3  ], m1
+    movq              m0, [r0]
+    movq              m1, [r0+r2]
+    movhps            m0, [r0+r2*2]
+    movhps            m1, [r0+r3]
+    paddusb           m0, m4
+    paddusb           m1, m6
+    psubusb           m0, m5
+    psubusb           m1, m7
+    movq            [r0], m0
+    movq         [r0+r2], m1
+    movhps     [r0+2*r2], m0
+    movhps       [r0+r3], m1
 %endmacro
 
-%macro TR_ADD_SSE_16_32_8 3
-    mova             xm2, [r1+%1   ]
+%macro ADD_RES_SSE_16_32_8 3
+    mova             xm2, [r1+%1]
     mova             xm6, [r1+%1+16]
 %if cpuflag(avx2)
     vinserti128       m2, m2, [r1+%1+32], 1
@@ -107,7 +107,7 @@ cglobal hevc_add_residual4_8, 3, 4, 6
     packuswb          m2, m6
     packuswb          m1, m5
 
-    mova             xm4, [r1+%1+mmsize*2   ]
+    mova             xm4, [r1+%1+mmsize*2]
     mova             xm6, [r1+%1+mmsize*2+16]
 %if cpuflag(avx2)
     vinserti128       m4, m4, [r1+%1+96 ], 1
@@ -135,39 +135,39 @@ cglobal hevc_add_residual4_8, 3, 4, 6
 
 
 %macro TRANSFORM_ADD_8 0
-; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual8_8, 3, 4, 8
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 4, 8
     lea               r3, [r2*3]
-    TR_ADD_SSE_8_8
+    ADD_RES_SSE_8_8
     add               r1, 64
     lea               r0, [r0+r2*4]
-    TR_ADD_SSE_8_8
+    ADD_RES_SSE_8_8
     RET
 
-; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual16_8, 3, 4, 7
-    pxor              m0, m0
-    lea               r3, [r2*3]
-    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
-    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 4, 7
+    pxor                m0, m0
+    lea                 r3, [r2*3]
+    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 %rep 3
-    add                r1, 128
-    lea                r0, [r0+r2*4]
-    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
-    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+    add                 r1, 128
+    lea                 r0, [r0+r2*4]
+    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 %endrep
     RET
 
-; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
-    pxor               m0, m0
-    TR_ADD_SSE_16_32_8  0, r0,    r0+16
-    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
+    pxor                m0, m0
+    ADD_RES_SSE_16_32_8  0, r0,    r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 %rep 15
-    add                r1, 128
-    lea                r0, [r0+r2*2]
-    TR_ADD_SSE_16_32_8  0, r0,    r0+16
-    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+    add                 r1, 128
+    lea                 r0, [r0+r2*2]
+    ADD_RES_SSE_16_32_8  0, r0,    r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 %endrep
     RET
 %endmacro
@@ -179,80 +179,77 @@ TRANSFORM_ADD_8
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
-    pxor              m0, m0
-    lea               r3, [r2*3]
-    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
-    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
+    pxor                 m0, m0
+    lea                  r3, [r2*3]
+    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 %rep 7
-    add                r1, 256
-    lea                r0, [r0+r2*4]
-    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
-    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+    add                  r1, 256
+    lea                  r0, [r0+r2*4]
+    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 %endrep
     RET
 %endif
 
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
-%macro TR_ADD_SSE_8_10 4
+%macro ADD_RES_SSE_8_10 4
     mova              m0, [%4]
     mova              m1, [%4+16]
     mova              m2, [%4+32]
     mova              m3, [%4+48]
-    paddw             m0, [%1+0   ]
-    paddw             m1, [%1+%2  ]
+    paddw             m0, [%1+0]
+    paddw             m1, [%1+%2]
     paddw             m2, [%1+%2*2]
-    paddw             m3, [%1+%3  ]
+    paddw             m3, [%1+%3]
     CLIPW             m0, m4, m5
     CLIPW             m1, m4, m5
     CLIPW             m2, m4, m5
     CLIPW             m3, m4, m5
-    mova       [%1+0   ], m0
-    mova       [%1+%2  ], m1
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
     mova       [%1+%2*2], m2
-    mova       [%1+%3  ], m3
+    mova         [%1+%3], m3
 %endmacro
 
-%macro TR_ADD_MMX4_10 3
-    mova              m0, [%1+0   ]
-    mova              m1, [%1+%2  ]
+%macro ADD_RES_MMX_4_10 3
+    mova              m0, [%1+0]
+    mova              m1, [%1+%2]
     paddw             m0, [%3]
     paddw             m1, [%3+8]
     CLIPW             m0, m2, m3
     CLIPW             m1, m2, m3
-    mova       [%1+0   ], m0
-    mova       [%1+%2  ], m1
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
 %endmacro
 
-%macro TRANS_ADD_SSE_16_10 3
+%macro ADD_RES_SSE_16_10 3
     mova              m0, [%3]
     mova              m1, [%3+16]
     mova              m2, [%3+32]
     mova              m3, [%3+48]
-    paddw             m0, [%1      ]
-    paddw             m1, [%1+16   ]
-    paddw             m2, [%1+%2   ]
+    paddw             m0, [%1]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+%2]
     paddw             m3, [%1+%2+16]
     CLIPW             m0, m4, m5
     CLIPW             m1, m4, m5
     CLIPW             m2, m4, m5
     CLIPW             m3, m4, m5
-    mova      [%1      ], m0
-    mova      [%1+16   ], m1
-    mova      [%1+%2   ], m2
+    mova            [%1], m0
+    mova         [%1+16], m1
+    mova         [%1+%2], m2
     mova      [%1+%2+16], m3
 %endmacro
 
-%macro TRANS_ADD_SSE_32_10 2
+%macro ADD_RES_SSE_32_10 2
     mova              m0, [%2]
     mova              m1, [%2+16]
     mova              m2, [%2+32]
     mova              m3, [%2+48]
 
-    paddw             m0, [%1   ]
+    paddw             m0, [%1]
     paddw             m1, [%1+16]
     paddw             m2, [%1+32]
     paddw             m3, [%1+48]
@@ -260,129 +257,125 @@ cglobal hevc_add_residual32_8, 3, 4, 7
     CLIPW             m1, m4, m5
     CLIPW             m2, m4, m5
     CLIPW             m3, m4, m5
-    mova         [%1   ], m0
+    mova            [%1], m0
     mova         [%1+16], m1
     mova         [%1+32], m2
     mova         [%1+48], m3
 %endmacro
 
-%macro TRANS_ADD16_AVX2 4
+%macro ADD_RES_AVX2_16_10 4
     mova              m0, [%4]
     mova              m1, [%4+32]
     mova              m2, [%4+64]
     mova              m3, [%4+96]
 
-    paddw             m0, [%1+0   ]
-    paddw             m1, [%1+%2  ]
+    paddw             m0, [%1+0]
+    paddw             m1, [%1+%2]
     paddw             m2, [%1+%2*2]
-    paddw             m3, [%1+%3  ]
+    paddw             m3, [%1+%3]
 
     CLIPW             m0, m4, m5
     CLIPW             m1, m4, m5
     CLIPW             m2, m4, m5
     CLIPW             m3, m4, m5
-    mova       [%1+0   ], m0
-    mova       [%1+%2  ], m1
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
     mova       [%1+%2*2], m2
-    mova       [%1+%3  ], m3
+    mova         [%1+%3], m3
 %endmacro
 
-%macro TRANS_ADD32_AVX2 3
+%macro ADD_RES_AVX2_32_10 3
     mova              m0, [%3]
     mova              m1, [%3+32]
     mova              m2, [%3+64]
     mova              m3, [%3+96]
 
-    paddw             m0, [%1      ]
-    paddw             m1, [%1+32   ]
-    paddw             m2, [%1+%2   ]
+    paddw             m0, [%1]
+    paddw             m1, [%1+32]
+    paddw             m2, [%1+%2]
     paddw             m3, [%1+%2+32]
 
     CLIPW             m0, m4, m5
     CLIPW             m1, m4, m5
     CLIPW             m2, m4, m5
     CLIPW             m3, m4, m5
-    mova      [%1      ], m0
-    mova      [%1+32   ], m1
-    mova      [%1+%2   ], m2
+    mova            [%1], m0
+    mova         [%1+32], m1
+    mova         [%1+%2], m2
     mova      [%1+%2+32], m3
 %endmacro
 
-
+; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
 INIT_MMX mmxext
-cglobal hevc_add_residual4_10,3,4, 6
+cglobal hevc_add_residual_4_10, 3, 4, 6
     pxor              m2, m2
     mova              m3, [max_pixels_10]
-    TR_ADD_MMX4_10     r0, r2, r1
+    ADD_RES_MMX_4_10  r0, r2, r1
     add               r1, 16
     lea               r0, [r0+2*r2]
-    TR_ADD_MMX4_10     r0, r2, r1
+    ADD_RES_MMX_4_10  r0, r2, r1
     RET
 
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal hevc_add_residual8_10,3,4,6
+cglobal hevc_add_residual_8_10, 3, 4, 6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
     lea               r3, [r2*3]
 
-    TR_ADD_SSE_8_10      r0, r2, r3, r1
+    ADD_RES_SSE_8_10  r0, r2, r3, r1
     lea               r0, [r0+r2*4]
     add               r1, 64
-    TR_ADD_SSE_8_10      r0, r2, r3, r1
+    ADD_RES_SSE_8_10  r0, r2, r3, r1
     RET
 
-cglobal hevc_add_residual16_10,3,4,6
+cglobal hevc_add_residual_16_10, 3, 4, 6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
 
-    TRANS_ADD_SSE_16_10 r0, r2, r1
+    ADD_RES_SSE_16_10 r0, r2, r1
 %rep 7
-    lea                 r0, [r0+r2*2]
-    add                 r1, 64
-    TRANS_ADD_SSE_16_10 r0, r2, r1
+    lea               r0, [r0+r2*2]
+    add               r1, 64
+    ADD_RES_SSE_16_10 r0, r2, r1
 %endrep
     RET
 
-cglobal hevc_add_residual32_10,3,4,6
+cglobal hevc_add_residual_32_10, 3, 4, 6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
 
-    TRANS_ADD_SSE_32_10 r0, r1
+    ADD_RES_SSE_32_10 r0, r1
 %rep 31
-    lea                 r0, [r0+r2]
-    add                 r1, 64
-    TRANS_ADD_SSE_32_10 r0, r1
+    lea               r0, [r0+r2]
+    add               r1, 64
+    ADD_RES_SSE_32_10 r0, r1
 %endrep
     RET
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
+cglobal hevc_add_residual_16_10, 3, 4, 6
+    pxor               m4, m4
+    mova               m5, [max_pixels_10]
+    lea                r3, [r2*3]
 
-cglobal hevc_add_residual16_10,3,4,6
-    pxor              m4, m4
-    mova              m5, [max_pixels_10]
-    lea               r3, [r2*3]
-
-    TRANS_ADD16_AVX2  r0, r2, r3, r1
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 %rep 3
-    lea               r0, [r0+r2*4]
-    add               r1, 128
-    TRANS_ADD16_AVX2  r0, r2, r3, r1
+    lea                r0, [r0+r2*4]
+    add                r1, 128
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 %endrep
     RET
 
-cglobal hevc_add_residual32_10,3,4,6
-    pxor              m4, m4
-    mova              m5, [max_pixels_10]
+cglobal hevc_add_residual_32_10, 3, 4, 6
+    pxor               m4, m4
+    mova               m5, [max_pixels_10]
 
-    TRANS_ADD32_AVX2  r0, r2, r1
+    ADD_RES_AVX2_32_10 r0, r2, r1
 %rep 15
-    lea               r0, [r0+r2*2]
-    add               r1, 128
-    TRANS_ADD32_AVX2  r0, r2, r1
+    lea                r0, [r0+r2*2]
+    add                r1, 128
+    ADD_RES_AVX2_32_10 r0, r2, r1
 %endrep
     RET
-%endif ;HAVE_AVX_EXTERNAL
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 63a148e..67be0a9 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -236,23 +236,24 @@ WEIGHTING_PROTOTYPES(12, sse4);
 ///////////////////////////////////////////////////////////////////////////////
 // TRANSFORM_ADD
 ///////////////////////////////////////////////////////////////////////////////
-void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
-void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
-void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
-void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
-void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
 #endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 0b17671..17cd233 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -713,7 +713,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
-            c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
+
+            c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@@ -734,9 +735,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0]    = ff_hevc_idct_4x4_8_sse2;
             c->idct[1]    = ff_hevc_idct_8x8_8_sse2;
 
-            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
-            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
-            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
+            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             if(ARCH_X86_64) {
@@ -772,9 +773,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0] = ff_hevc_idct_4x4_8_avx;
             c->idct[1] = ff_hevc_idct_8x8_8_avx;
 
-            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
-            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
-            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
+            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
@@ -874,11 +875,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 
-            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
+            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
         }
@@ -902,9 +903,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0]    = ff_hevc_idct_4x4_10_sse2;
             c->idct[1]    = ff_hevc_idct_8x8_10_sse2;
 
-            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
-            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
-            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
+            c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -1090,9 +1091,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SAO_BAND_INIT(10, avx2);
             SAO_EDGE_INIT(10, avx2);
 
-            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
-            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
-
+            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
         }
     } else if (bit_depth == 12) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {

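As a side note on how the renamed functions above are consumed: the decoder
calls through the HEVCDSPContext table that ff_hevc_dsp_init_x86() fills
according to the detected CPU flags. A minimal self-contained sketch of that
dispatch pattern (the wrapper and the log2_trafo_size parameter are
illustrative, not code from this patch):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct HEVCDSPContext {
        /* one slot per block size: 0 -> 4x4, 1 -> 8x8, 2 -> 16x16, 3 -> 32x32 */
        void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
    } HEVCDSPContext;

    /* hypothetical caller: pick the SIMD routine by transform size */
    static void add_residual_block(const HEVCDSPContext *c, uint8_t *dst,
                                   int16_t *res, ptrdiff_t stride,
                                   int log2_trafo_size /* 2..5 */)
    {
        c->add_residual[log2_trafo_size - 2](dst, res, stride);
    }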

======================================================================

diff --cc doc/libav-merge.txt
index d57b79a,0000000..44547c9
mode 100644,000000..100644
--- a/doc/libav-merge.txt
+++ b/doc/libav-merge.txt
@@@ -1,114 -1,0 +1,115 @@@
 +CONTEXT
 +=======
 +
 +The FFmpeg project merges all the changes from the Libav project
 +(https://libav.org) since the origin of the fork (around 2011).
 +
 +With the exception of some commits skipped due to technical/political
 +disagreements or issues, the changes are merged on a more or less regular
 +schedule (daily for years thanks to Michael, but more sparsely nowadays).
 +
 +WHY
 +===
 +
 +The majority of the active developers believe the project needs to keep this
 +policy for various reasons.
 +
 +The most important one is that we don't want our users to have to choose
 +between two distributors of libraries with the exact same names in order to
 +get a different set of features and bugfixes. By taking on the responsibility
 +of unifying the two codebases, we allow users to benefit from the changes made
 +by both teams.
 +
 +Today, FFmpeg has a much larger user base (we are packaged by every major
 +distribution), so we consider this mission a priority.
 +
 +A different approach to the merge could have been to pick only the changes we
 +are interested in and drop most of the cosmetic and other less important
 +changes. Unfortunately, this would make every following pick much harder,
 +especially since the Libav project is involved in various deep API changes. As
 +a result, we decided to take virtually everything done there.
 +
 +Any Libav developer is of course welcome to contribute directly to the FFmpeg
 +tree at any time. We fully understand, and are forced to accept, that very few
 +Libav developers are interested in doing so, but we still want to recognize
 +their work. This leads us to create a merge commit for every single commit from
 +Libav. The original commit appears totally unchanged, with full authorship, in
 +our history (and the conflicts are resolved in the merge commit). That way, not
 +a single thing from Libav will be lost in the future in case some reunification
 +happens, or the project disappears one way or another.
 +
 +DOWNSIDES
 +=========
 +
 +Of course, there are many downsides to this approach.
 +
 +- It causes non-negligible merge-commit pollution. We make sure there are not
 +  several levels of merges entangled (we do a 1:1 merge/commit mapping), but
 +  it's still a non-linear history.
 +
 +- Much duplicated work. For instance, we added libavresample to our tree to
 +  keep compatibility with Libav when our libswresample was already covering the
 +  exact same purpose. The same thing happened for various elements such as the
 +  ProRes support (with differences in features, bugs, licenses, ...). There is
 +  much work to do to unify them, and any help is very much welcome.
 +
 +- So much manpower from both FFmpeg and Libav is lost because of this mess. We
 +  know it, and we don't know how to fix it. It takes an incredible amount of
 +  time to do these merges, so we have even less time to work on the things we
 +  personally care about. The bad vibes also do not help keep our developers
 +  motivated.
 +
 +- There is a growing technical risk factor with the merges due to the two
 +  codebases diverging more and more.
 +
 +MERGE GUIDELINES
 +================
 +
 +The following gives developer guidelines on how to proceed when merging Libav commits.
 +
 +Before starting, you can reduce the risk of errors on merge conflicts by using
 +a different merge conflict style:
 +
 +    $ git config --global merge.conflictstyle diff3
 +
 +tools/libav-merge-next-commit is a script to help merge the next commit in the
 +queue. It assumes a remote named libav. It has two modes: merge and noop. The
 +noop mode creates a merge with no change to HEAD. You can pass a hash as an
 +extra argument to reference a justification (it is common that we already have
 +the change done in FFmpeg).
 +
 +Also see tools/murge: you can copy and paste a 3-way conflict into its stdin,
 +and it will display colored diffs. Any arguments to murge (such as ones to
 +suppress whitespace differences) are passed on to colordiff.
 +
 +TODO/FIXME/UNMERGED
 +===================
 +
 +Stuff that didn't reach the codebase:
 +-------------------------------------
 +
 +- HEVC DSP and x86 MC SIMD improvements from Libav (see https://ffmpeg.org/pipermail/ffmpeg-devel/2015-December/184777.html)
 +  - 1f821750f hevcdsp: split the qpel functions by width instead of by the subpixel fraction
 +  - 818bfe7f0 hevcdsp: split the epel functions by width
 +  - 688417399 hevcdsp: split the pred functions by width
 +  - a853388d2 hevc: change the stride of the MC buffer to be in bytes instead of elements
 +  - 0cef06df0 checkasm: add HEVC MC tests
 +  - e7078e842 hevcdsp: add x86 SIMD for MC
 +- VAAPI VP8 decode hwaccel (currently under review: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/thread.html#207348)
 +- Removal of the custom atomic API (5cc0057f49, see http://ffmpeg.org/pipermail/ffmpeg-devel/2017-March/209003.html)
 +- Use the new bitstream filter for extracting extradata (see 8e2ea69135 and 096a8effa3)
++- ADD_RES_MMX_4_8 in libavcodec/x86/hevc_add_res.asm probably needs updating (see 589880710)
 +
 +Collateral damage that needs work locally:
 +------------------------------------------
 +
 +- Merge proresdec2.c and proresdec_lgpl.c
 +- Merge proresenc_anatoliy.c and proresenc_kostya.c
 +- Remove ADVANCED_PARSER in libavcodec/hevc_parser.c
 +- Fix MIPS AC3 downmix
 +
 +Extra changes needed to be aligned with Libav:
 +----------------------------------------------
 +
 +- Switching our examples to the new encode/decode API (see 67d28f4a0f)
 +- AC3 speed-up for our fixed version (see a9ba59591e)
 +- HEVC IDCT bit depth 12-bit support (Libav added 8 and 10 but doesn't have 12)
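
To make the merge guidelines above concrete, a hypothetical session (the
remote URL and the justification hash are placeholders; the script modes are
the ones described in the text):

    $ git config --global merge.conflictstyle diff3
    $ git remote add libav <libav-git-url>          # the script assumes a remote named "libav"
    $ ./tools/libav-merge-next-commit merge         # merge the next commit in the queue
    $ ./tools/libav-merge-next-commit noop c0ffee0  # noop merge, referencing a justification
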
diff --cc libavcodec/hevcdsp.h
index 3b7e737,49cb711..eefb3cd
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@@ -43,82 -39,77 +43,82 @@@ typedef struct SAOParams 
  } SAOParams;
  
  typedef struct HEVCDSPContext {
 -    void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
 -                    GetBitContext *gb, int pcm_bit_depth);
 +    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
 +                    struct GetBitContext *gb, int pcm_bit_depth);
  
-     void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+     void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  
 -    void (*dequant)(int16_t *coeffs);
 +    void (*dequant)(int16_t *coeffs, int16_t log2_size);
 +
 +    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
 +
      void (*transform_4x4_luma)(int16_t *coeffs);
 +
      void (*idct[4])(int16_t *coeffs, int col_limit);
 +
      void (*idct_dc[4])(int16_t *coeffs);
  
 -    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 -                               struct SAOParams *sao, int *borders,
 -                               int width, int height, int c_idx);
 -    void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 -                               struct SAOParams *sao, int *borders, int width,
 -                               int height, int c_idx, uint8_t vert_edge,
 -                               uint8_t horiz_edge, uint8_t diag_edge);
 -
 -    void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
 -                                   ptrdiff_t srcstride, int height,
 -                                   int mx, int my, int16_t *mcbuffer);
 -    void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
 -                                   ptrdiff_t srcstride, int height,
 -                                   int mx, int my, int16_t *mcbuffer);
 -
 -    void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                                   ptrdiff_t srcstride, int height);
 -    void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                                          ptrdiff_t srcstride, int height);
 -    void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride,
 -                                       int16_t *src1, int16_t *src2,
 -                                       ptrdiff_t srcstride, int height);
 -    void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride,
 -                                              int16_t *src1, int16_t *src2,
 -                                              ptrdiff_t srcstride, int height);
 -    void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
 -                             uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                             ptrdiff_t srcstride, int height);
 -    void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
 -                                    uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                                    ptrdiff_t srcstride, int height);
 -    void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
 -                                 int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
 -                                 ptrdiff_t dststride, int16_t *src1, int16_t *src2,
 -                                 ptrdiff_t srcstride, int height);
 -    void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
 -                                        int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
 -                                        ptrdiff_t dststride, int16_t *src1, int16_t *src2,
 -                                        ptrdiff_t srcstride, int height);
 +    void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 +                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
 +
 +    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
 +    void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
 +                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
 +
 +    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 +                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
 +                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
 +
 +    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
 +                                    int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                        int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
 +
 +    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                       int16_t *src2,
 +                                       int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                         int16_t *src2,
 +                                         int height, int denom, int wx0, int wx1,
 +                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
 +                                    int height, intptr_t mx, intptr_t my, int width);
 +
 +    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                        int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                       int16_t *src2,
 +                                       int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                         int16_t *src2,
 +                                         int height, int denom, int wx0, int ox0, int wx1,
 +                                         int ox1, intptr_t mx, intptr_t my, int width);
  
      void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 -                                    int beta, int *tc,
 +                                    int beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q);
      void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 -                                    int beta, int *tc,
 +                                    int beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q);
      void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
 -                                      int *tc, uint8_t *no_p, uint8_t *no_q);
 +                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
      void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
 -                                      int *tc, uint8_t *no_p, uint8_t *no_q);
 +                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
      void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                      int beta, int *tc,
 +                                      int beta, int32_t *tc,
                                        uint8_t *no_p, uint8_t *no_q);
      void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                      int beta, int *tc,
 +                                      int beta, int32_t *tc,
                                        uint8_t *no_p, uint8_t *no_q);
      void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                        int *tc, uint8_t *no_p,
 +                                        int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q);
      void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                        int *tc, uint8_t *no_p,
 +                                        int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q);
  } HEVCDSPContext;
  
diff --cc libavcodec/x86/hevc_add_res.asm
index 869288f,66b929c..1ea15df
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@@ -15,51 -15,49 +15,51 @@@
  ; * Lesser General Public License for more details.
  ; *
  ; * You should have received a copy of the GNU Lesser General Public
 -; * License along with Libav; if not, write to the Free Software
 +; * License along with FFmpeg; if not, write to the Free Software
  ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ; */
+ ; ******************************************************************************
+ 
  %include "libavutil/x86/x86util.asm"
  
 -SECTION_RODATA 32
 -max_pixels_10:          times 16  dw ((1 << 10)-1)
 -
  SECTION .text
  
 +cextern pw_1023
 +%define max_pixels_10 pw_1023
 +
- 
- ;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
- %macro TR_ADD_MMX_4_8 0
+ ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+ %macro ADD_RES_MMX_4_8 0
 -    mova              m0, [r1]
 -    mova              m2, [r1+8]
 -    pxor              m1, m1
 +    mova              m2, [r1]
 +    mova              m4, [r1+8]
      pxor              m3, m3
 -    psubw             m1, m0
      psubw             m3, m2
 -    packuswb          m0, m2
 -    packuswb          m1, m3
 -
 -    movd              m2, [r0]
 -    movd              m3, [r0+r2]
 -    punpckldq         m2, m3
 +    packuswb          m2, m2
 +    packuswb          m3, m3
 +    pxor              m5, m5
 +    psubw             m5, m4
 +    packuswb          m4, m4
 +    packuswb          m5, m5
 +
-     movh              m0, [r0     ]
-     movh              m1, [r0+r2  ]
++    movh              m0, [r0]
++    movh              m1, [r0+r2]
      paddusb           m0, m2
 -    psubusb           m0, m1
 -    movd            [r0], m0
 -    psrlq             m0, 32
 -    movd         [r0+r2], m0
 +    paddusb           m1, m4
 +    psubusb           m0, m3
 +    psubusb           m1, m5
-     movh       [r0     ], m0
-     movh       [r0+r2  ], m1
++    movh            [r0], m0
++    movh         [r0+r2], m1
  %endmacro
  
  
  INIT_MMX mmxext
- ; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual4_8, 3, 4, 6
-     TR_ADD_MMX_4_8
+ ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
 -cglobal hevc_add_residual_4_8, 3, 3, 6
++cglobal hevc_add_residual_4_8, 3, 4, 6
+     ADD_RES_MMX_4_8
      add               r1, 16
      lea               r0, [r0+r2*2]
-     TR_ADD_MMX_4_8
+     ADD_RES_MMX_4_8
      RET
  
- %macro TR_ADD_SSE_8_8 0
+ %macro ADD_RES_SSE_8_8 0
      pxor              m3, m3
      mova              m4, [r1]
      mova              m6, [r1+16]
@@@ -135,40 -119,40 +135,40 @@@
  
  
  %macro TRANSFORM_ADD_8 0
- ; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual8_8, 3, 4, 8
+ ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+ cglobal hevc_add_residual_8_8, 3, 4, 8
      lea               r3, [r2*3]
-     TR_ADD_SSE_8_8
+     ADD_RES_SSE_8_8
      add               r1, 64
      lea               r0, [r0+r2*4]
-     TR_ADD_SSE_8_8
+     ADD_RES_SSE_8_8
      RET
  
- ; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual16_8, 3, 4, 7
-     pxor              m0, m0
-     lea               r3, [r2*3]
-     TR_ADD_SSE_16_32_8  0, r0,      r0+r2
-     TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+ ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
 -cglobal hevc_add_residual_16_8, 3, 5, 7
++cglobal hevc_add_residual_16_8, 3, 4, 7
+     pxor                m0, m0
+     lea                 r3, [r2*3]
 -    mov                r4d, 4
 -.loop:
+     ADD_RES_SSE_16_32_8  0, r0,      r0+r2
+     ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 +%rep 3
-     add                r1, 128
-     lea                r0, [r0+r2*4]
-     TR_ADD_SSE_16_32_8  0, r0,      r0+r2
-     TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+     add                 r1, 128
+     lea                 r0, [r0+r2*4]
 -    dec                r4d
 -    jg .loop
++    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
++    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 +%endrep
      RET
  
- ; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual32_8, 3, 4, 7
-     pxor               m0, m0
-     TR_ADD_SSE_16_32_8  0, r0,    r0+16
-     TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+ ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
 -cglobal hevc_add_residual_32_8, 3, 5, 7
++cglobal hevc_add_residual_32_8, 3, 4, 7
+     pxor                m0, m0
 -    mov                r4d, 16
 -.loop:
+     ADD_RES_SSE_16_32_8  0, r0,    r0+16
+     ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 +%rep 15
-     add                r1, 128
-     lea                r0, [r0+r2*2]
-     TR_ADD_SSE_16_32_8  0, r0,    r0+16
-     TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+     add                 r1, 128
+     lea                 r0, [r0+r2*2]
 -    dec                r4d
 -    jg .loop
++    ADD_RES_SSE_16_32_8  0, r0,    r0+16
++    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 +%endrep
      RET
  %endmacro
  
@@@ -179,25 -163,22 +179,22 @@@ TRANSFORM_ADD_
  
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
- ; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_add_residual32_8, 3, 4, 7
-     pxor              m0, m0
-     lea               r3, [r2*3]
-     TR_ADD_SSE_16_32_8   0, r0,      r0+r2
-     TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+ ; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
 -cglobal hevc_add_residual_32_8, 3, 5, 7
++cglobal hevc_add_residual_32_8, 3, 4, 7
+     pxor                 m0, m0
+     lea                  r3, [r2*3]
 -    mov                 r4d, 8
 -.loop:
+     ADD_RES_SSE_16_32_8   0, r0,      r0+r2
+     ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 +%rep 7
-     add                r1, 256
-     lea                r0, [r0+r2*4]
-     TR_ADD_SSE_16_32_8   0, r0,      r0+r2
-     TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+     add                  r1, 256
+     lea                  r0, [r0+r2*4]
 -    dec                 r4d
 -    jg .loop
++    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
++    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 +%endrep
      RET
 -%endif ;HAVE_AVX2_EXTERNAL
 +%endif
  
- ;-----------------------------------------------------------------------------
- ; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
- ;-----------------------------------------------------------------------------
- %macro TR_ADD_SSE_8_10 4
+ %macro ADD_RES_SSE_8_10 4
      mova              m0, [%4]
      mova              m1, [%4+16]
      mova              m2, [%4+32]
@@@ -308,81 -289,81 +305,77 @@@
      mova      [%1+%2+32], m3
  %endmacro
  
- 
+ ; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
  INIT_MMX mmxext
- cglobal hevc_add_residual4_10,3,4, 6
 -cglobal hevc_add_residual_4_10, 3, 3, 6
++cglobal hevc_add_residual_4_10, 3, 4, 6
      pxor              m2, m2
      mova              m3, [max_pixels_10]
-     TR_ADD_MMX4_10     r0, r2, r1
+     ADD_RES_MMX_4_10  r0, r2, r1
      add               r1, 16
      lea               r0, [r0+2*r2]
-     TR_ADD_MMX4_10     r0, r2, r1
+     ADD_RES_MMX_4_10  r0, r2, r1
      RET
  
- ;-----------------------------------------------------------------------------
- ; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
- ;-----------------------------------------------------------------------------
  INIT_XMM sse2
- cglobal hevc_add_residual8_10,3,4,6
+ cglobal hevc_add_residual_8_10, 3, 4, 6
      pxor              m4, m4
      mova              m5, [max_pixels_10]
      lea               r3, [r2*3]
  
-     TR_ADD_SSE_8_10      r0, r2, r3, r1
+     ADD_RES_SSE_8_10  r0, r2, r3, r1
      lea               r0, [r0+r2*4]
      add               r1, 64
-     TR_ADD_SSE_8_10      r0, r2, r3, r1
+     ADD_RES_SSE_8_10  r0, r2, r3, r1
      RET
  
- cglobal hevc_add_residual16_10,3,4,6
 -cglobal hevc_add_residual_16_10, 3, 5, 6
++cglobal hevc_add_residual_16_10, 3, 4, 6
      pxor              m4, m4
      mova              m5, [max_pixels_10]
  
-     TRANS_ADD_SSE_16_10 r0, r2, r1
 -    mov              r4d, 8
 -.loop:
+     ADD_RES_SSE_16_10 r0, r2, r1
 +%rep 7
-     lea                 r0, [r0+r2*2]
-     add                 r1, 64
-     TRANS_ADD_SSE_16_10 r0, r2, r1
+     lea               r0, [r0+r2*2]
+     add               r1, 64
 -    dec              r4d
 -    jg .loop
++    ADD_RES_SSE_16_10 r0, r2, r1
 +%endrep
      RET
  
- cglobal hevc_add_residual32_10,3,4,6
 -cglobal hevc_add_residual_32_10, 3, 5, 6
++cglobal hevc_add_residual_32_10, 3, 4, 6
      pxor              m4, m4
      mova              m5, [max_pixels_10]
  
-     TRANS_ADD_SSE_32_10 r0, r1
 -    mov              r4d, 32
 -.loop
+     ADD_RES_SSE_32_10 r0, r1
 +%rep 31
-     lea                 r0, [r0+r2]
-     add                 r1, 64
-     TRANS_ADD_SSE_32_10 r0, r1
+     lea               r0, [r0+r2]
+     add               r1, 64
 -    dec              r4d
 -    jg .loop
++    ADD_RES_SSE_32_10 r0, r1
 +%endrep
      RET
  
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
 -cglobal hevc_add_residual_16_10, 3, 5, 6
++cglobal hevc_add_residual_16_10, 3, 4, 6
+     pxor               m4, m4
+     mova               m5, [max_pixels_10]
+     lea                r3, [r2*3]
  
- cglobal hevc_add_residual16_10,3,4,6
-     pxor              m4, m4
-     mova              m5, [max_pixels_10]
-     lea               r3, [r2*3]
- 
-     TRANS_ADD16_AVX2  r0, r2, r3, r1
 -    mov               r4d, 4
 -.loop
+     ADD_RES_AVX2_16_10 r0, r2, r3, r1
 +%rep 3
-     lea               r0, [r0+r2*4]
-     add               r1, 128
-     TRANS_ADD16_AVX2  r0, r2, r3, r1
+     lea                r0, [r0+r2*4]
+     add                r1, 128
 -    dec               r4d
 -    jg .loop
++    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 +%endrep
      RET
  
- cglobal hevc_add_residual32_10,3,4,6
-     pxor              m4, m4
-     mova              m5, [max_pixels_10]
 -cglobal hevc_add_residual_32_10, 3, 5, 6
++cglobal hevc_add_residual_32_10, 3, 4, 6
+     pxor               m4, m4
+     mova               m5, [max_pixels_10]
  
-     TRANS_ADD32_AVX2  r0, r2, r1
 -    mov               r4d, 16
 -.loop
+     ADD_RES_AVX2_32_10 r0, r2, r1
 +%rep 15
-     lea               r0, [r0+r2*2]
-     add               r1, 128
-     TRANS_ADD32_AVX2  r0, r2, r1
+     lea                r0, [r0+r2*2]
+     add                r1, 128
 -    dec               r4d
 -    jg .loop
++    ADD_RES_AVX2_32_10 r0, r2, r1
 +%endrep
      RET
- %endif ;HAVE_AVX_EXTERNAL
+ %endif ;HAVE_AVX2_EXTERNAL
diff --cc libavcodec/x86/hevcdsp.h
index 63a148e,0000000..67be0a9
mode 100644,000000..100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@@ -1,258 -1,0 +1,259 @@@
 +/*
 + * HEVC video decoder
 + *
 + * Copyright (C) 2012 - 2013 Guillaume Martres
 + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
 + *
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_X86_HEVCDSP_H
 +#define AVCODEC_X86_HEVCDSP_H
 +
 +#include <stddef.h>
 +#include <stdint.h>
 +
 +
 +#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 +dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
 +
 +
 +#define PEL_PROTOTYPE(name, D, opt) \
 +void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
 +void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// MC functions
 +///////////////////////////////////////////////////////////////////////////////
 +
 +#define EPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##6,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define QPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
 +void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
 +void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
 +
 +#define WEIGHTING_PROTOTYPES(bitd, opt) \
 +        WEIGHTING_PROTOTYPE(2, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(4, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(6, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(8, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(12, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(16, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(24, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(32, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(48, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(64, bitd, opt)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL_PIXELS EPEL_PIXELS
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 12, sse4);
 +
 +void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +
 +void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +
 +void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);  // only used for 10 bit: a 48 pixel 10 bit row is 96 bytes
 +void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); // only used for 10 bit: a 64 pixel 10 bit row is 128 bytes
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// EPEL
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(epel_h,  8, sse4);
 +EPEL_PROTOTYPES(epel_h, 10, sse4);
 +EPEL_PROTOTYPES(epel_h, 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_v,  8, sse4);
 +EPEL_PROTOTYPES(epel_v, 10, sse4);
 +EPEL_PROTOTYPES(epel_v, 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_hv,  8, sse4);
 +EPEL_PROTOTYPES(epel_hv, 10, sse4);
 +EPEL_PROTOTYPES(epel_hv, 12, sse4);
 +
 +PEL_PROTOTYPE(epel_h16, 8, avx2);
 +PEL_PROTOTYPE(epel_h24, 8, avx2);
 +PEL_PROTOTYPE(epel_h32, 8, avx2);
 +PEL_PROTOTYPE(epel_h48, 8, avx2);
 +PEL_PROTOTYPE(epel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_h16,10, avx2);
 +PEL_PROTOTYPE(epel_h24,10, avx2);
 +PEL_PROTOTYPE(epel_h32,10, avx2);
 +PEL_PROTOTYPE(epel_h48,10, avx2);
 +PEL_PROTOTYPE(epel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_v16, 8, avx2);
 +PEL_PROTOTYPE(epel_v24, 8, avx2);
 +PEL_PROTOTYPE(epel_v32, 8, avx2);
 +PEL_PROTOTYPE(epel_v48, 8, avx2);
 +PEL_PROTOTYPE(epel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_v16,10, avx2);
 +PEL_PROTOTYPE(epel_v24,10, avx2);
 +PEL_PROTOTYPE(epel_v32,10, avx2);
 +PEL_PROTOTYPE(epel_v48,10, avx2);
 +PEL_PROTOTYPE(epel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16, 8, avx2);
 +PEL_PROTOTYPE(epel_hv24, 8, avx2);
 +PEL_PROTOTYPE(epel_hv32, 8, avx2);
 +PEL_PROTOTYPE(epel_hv48, 8, avx2);
 +PEL_PROTOTYPE(epel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16,10, avx2);
 +PEL_PROTOTYPE(epel_hv24,10, avx2);
 +PEL_PROTOTYPE(epel_hv32,10, avx2);
 +PEL_PROTOTYPE(epel_hv48,10, avx2);
 +PEL_PROTOTYPE(epel_hv64,10, avx2);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL
 +///////////////////////////////////////////////////////////////////////////////
 +QPEL_PROTOTYPES(qpel_h,  8, sse4);
 +QPEL_PROTOTYPES(qpel_h, 10, sse4);
 +QPEL_PROTOTYPES(qpel_h, 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_v,  8, sse4);
 +QPEL_PROTOTYPES(qpel_v, 10, sse4);
 +QPEL_PROTOTYPES(qpel_v, 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_hv,  8, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 12, sse4);
 +
 +PEL_PROTOTYPE(qpel_h16, 8, avx2);
 +PEL_PROTOTYPE(qpel_h24, 8, avx2);
 +PEL_PROTOTYPE(qpel_h32, 8, avx2);
 +PEL_PROTOTYPE(qpel_h48, 8, avx2);
 +PEL_PROTOTYPE(qpel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_h16,10, avx2);
 +PEL_PROTOTYPE(qpel_h24,10, avx2);
 +PEL_PROTOTYPE(qpel_h32,10, avx2);
 +PEL_PROTOTYPE(qpel_h48,10, avx2);
 +PEL_PROTOTYPE(qpel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16, 8, avx2);
 +PEL_PROTOTYPE(qpel_v24, 8, avx2);
 +PEL_PROTOTYPE(qpel_v32, 8, avx2);
 +PEL_PROTOTYPE(qpel_v48, 8, avx2);
 +PEL_PROTOTYPE(qpel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16,10, avx2);
 +PEL_PROTOTYPE(qpel_v24,10, avx2);
 +PEL_PROTOTYPE(qpel_v32,10, avx2);
 +PEL_PROTOTYPE(qpel_v48,10, avx2);
 +PEL_PROTOTYPE(qpel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv24, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv32, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv48, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16,10, avx2);
 +PEL_PROTOTYPE(qpel_hv24,10, avx2);
 +PEL_PROTOTYPE(qpel_hv32,10, avx2);
 +PEL_PROTOTYPE(qpel_hv48,10, avx2);
 +PEL_PROTOTYPE(qpel_hv64,10, avx2);
 +
 +WEIGHTING_PROTOTYPES(8, sse4);
 +WEIGHTING_PROTOTYPES(10, sse4);
 +WEIGHTING_PROTOTYPES(12, sse4);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// TRANSFORM_ADD
 +///////////////////////////////////////////////////////////////////////////////
- void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
- void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 +
- void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 +
- void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 +
- void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++
++void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 +
 +#endif // AVCODEC_X86_HEVCDSP_H
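
As a reading aid for the header above: the *_PROTOTYPES helpers are plain
token pasting, emitting one declaration per supported block width (EPEL also
covers the 6-pixel chroma width, QPEL does not). For example, the single
line WEIGHTING_PROTOTYPE(8, 8, sse4) expands to the following pair, shown
here with normalized spacing:

    void ff_hevc_put_hevc_uni_w8_8_sse4(uint8_t *dst, ptrdiff_t dststride,
                                        int16_t *_src, int height, int denom,
                                        int _wx, int _ox);
    void ff_hevc_put_hevc_bi_w8_8_sse4(uint8_t *dst, ptrdiff_t dststride,
                                       int16_t *_src, int16_t *_src2,
                                       int height, int denom, int _wx0,
                                       int _wx1, int _ox0, int _ox1);
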
diff --cc libavcodec/x86/hevcdsp_init.c
index 0b17671,a95fa30..17cd233
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@@ -734,153 -315,37 +735,153 @@@ void ff_hevc_dsp_init_x86(HEVCDSPContex
              c->idct[0]    = ff_hevc_idct_4x4_8_sse2;
              c->idct[1]    = ff_hevc_idct_8x8_8_sse2;
  
-             c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
-             c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
-             c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
 -            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 -
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
++            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
++            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
++            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
          }
          if (EXTERNAL_SSSE3(cpu_flags)) {
 -            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
 -            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +            }
 +            SAO_EDGE_INIT(8, ssse3);
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
 +
 +                c->idct[2] = ff_hevc_idct_16x16_8_avx;
 +                c->idct[3] = ff_hevc_idct_32x32_8_avx;
 +            }
 +            SAO_BAND_INIT(8, avx);
 +
              c->idct[0] = ff_hevc_idct_4x4_8_avx;
              c->idct[1] = ff_hevc_idct_8x8_8_avx;
 +
-             c->add_residual[1] = ff_hevc_add_residual8_8_avx;
-             c->add_residual[2] = ff_hevc_add_residual16_8_avx;
-             c->add_residual[3] = ff_hevc_add_residual32_8_avx;
+             c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+             c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+             c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
 +            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
 +            }
 +            SAO_BAND_INIT(8, avx2);
 +
 +            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
 +            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
 +            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 +
-             c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
+             c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
          }
      } else if (bit_depth == 10) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
++            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
              c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
              c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
 -
 -            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
          }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
@@@ -901,251 -357,89 +902,250 @@@
  
              c->idct[0]    = ff_hevc_idct_4x4_10_sse2;
              c->idct[1]    = ff_hevc_idct_8x8_10_sse2;
 -            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
  
-             c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
-             c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
-             c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
+             c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+             c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+             c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
          }
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
 +        }
          if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
 +
 +                c->idct[2] = ff_hevc_idct_16x16_10_avx;
 +                c->idct[3] = ff_hevc_idct_32x32_10_avx;
 +            }
 +
              c->idct[0] = ff_hevc_idct_4x4_10_avx;
              c->idct[1] = ff_hevc_idct_8x8_10_avx;
 +
 +            SAO_BAND_INIT(10, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
 -            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
          }
 -    }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
  
 -#if ARCH_X86_64
 -    if (bit_depth == 8) {
 -        if (EXTERNAL_SSE2(cpu_flags)) {
 -            c->idct[2] = ff_hevc_idct_16x16_8_sse2;
 -            c->idct[3] = ff_hevc_idct_32x32_8_sse2;
 -        }
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 -        }
 +                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
  
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
 -        }
 +                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
  
 -        if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 -            c->idct[2] = ff_hevc_idct_16x16_8_avx;
 -            c->idct[3] = ff_hevc_idct_32x32_8_avx;
 +                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
 +                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
 +                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
 +                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
 +                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
 +                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
 +                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
 +            }
 +            SAO_BAND_INIT(10, avx2);
 +            SAO_EDGE_INIT(10, avx2);
 +
-             c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
-             c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
- 
++            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
++            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
          }
 -        if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +    } else if (bit_depth == 12) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
 +            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
 +            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
          }
 -    } else if (bit_depth == 10) {
          if (EXTERNAL_SSE2(cpu_flags)) {
 -            c->idct[2] = ff_hevc_idct_16x16_10_sse2;
 -            c->idct[3] = ff_hevc_idct_32x32_10_sse2;
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 +            }
 +            SAO_BAND_INIT(12, sse2);
 +            SAO_EDGE_INIT(12, sse2);
 +
 +            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
          }
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
          }
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
 -            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
 -            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 -            c->idct[2] = ff_hevc_idct_16x16_10_avx;
 -            c->idct[3] = ff_hevc_idct_32x32_10_avx;
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
 +            }
 +            SAO_BAND_INIT(12, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
 +
 +            SAO_BAND_INIT(12, avx2);
 +            SAO_EDGE_INIT(12, avx2);
          }
      }
 -#endif /* ARCH_X86_64 */
  }
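
A note on the add_residual[] slots assigned throughout the init code above:
the array is indexed by transform size, slot i holding the
(4 << i) x (4 << i) kernel, which is why the MMXEXT 4x4 version lands in
slot 0 and the AVX2 32x32 version in slot 3. Every variant computes the
usual residual-plus-prediction reconstruction. A minimal scalar sketch of
the 8 bit case (illustrative only; the function name is hypothetical and
this is not the template code from libavcodec/hevcdsp_template.c):

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Add an inverse-transform residual block to the reconstructed
     * pixels, saturating to the 8 bit range. size is 4, 8, 16 or 32,
     * matching add_residual[0..3]. */
    static void add_residual_sketch(uint8_t *dst, const int16_t *res,
                                    ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++)
                dst[x] = clip_uint8(dst[x] + res[x]);
            dst += stride;
            res += size;
        }
    }

The 10 bit variants follow the same pattern with 16 bit pixels clipped to
[0, 1023]; the SIMD versions replace the inner loop with packed, saturating
arithmetic over whole rows.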



