[FFmpeg-devel] [PATCH 1/7] x86: hevc_mc: add AVX2 optimizations
Christophe Gisquet
christophe.gisquet at gmail.com
Thu Feb 5 20:20:39 CET 2015
From: plepere <pierre-edouard.lepere at insa-rennes.fr>
before
33304 decicycles in luma_bi_1, 523066 runs, 1222 skips
38138 decicycles in luma_bi_2, 523427 runs, 861 skips
13490 decicycles in luma_uni, 516138 runs, 8150 skips
after
20185 decicycles in luma_bi_1, 519970 runs, 4318 skips
24620 decicycles in luma_bi_2, 521024 runs, 3264 skips
10397 decicycles in luma_uni, 515715 runs, 8573 skips
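In ratio terms, computed from the decicycle averages above, that is roughly:

    luma_bi_1: 33304 / 20185 ~= 1.65x
    luma_bi_2: 38138 / 24620 ~= 1.55x
    luma_uni:  13490 / 10397 ~= 1.30x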
Conflicts:
libavcodec/x86/hevc_mc.asm
libavcodec/x86/hevcdsp_init.c
---
libavcodec/x86/hevc_mc.asm | 581 +++++++++++++++++++++++++++++++-----------
libavcodec/x86/hevcdsp.h | 105 ++++++++
libavcodec/x86/hevcdsp_init.c | 370 ++++++++++++++++++++++++++-
3 files changed, 908 insertions(+), 148 deletions(-)
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 8f9f939..efb4d1f 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -20,19 +20,20 @@
; */
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pw_8: times 8 dw (1 << 9)
-pw_10: times 8 dw (1 << 11)
-pw_12: times 8 dw (1 << 13)
-pw_bi_8: times 8 dw (1 << 8)
-pw_bi_10: times 8 dw (1 << 10)
-pw_bi_12: times 8 dw (1 << 12)
-max_pixels_10: times 8 dw ((1 << 10)-1)
-max_pixels_12: times 8 dw ((1 << 12)-1)
-zero: times 4 dd 0
-one_per_32: times 4 dd 1
-
-SECTION .text
+SECTION_RODATA 32
+pw_8: times 16 dw (1 << 9)
+pw_10: times 16 dw (1 << 11)
+pw_12: times 16 dw (1 << 13)
+pw_bi_8: times 16 dw (1 << 8)
+pw_bi_10: times 16 dw (1 << 10)
+pw_bi_12: times 16 dw (1 << 12)
+max_pixels_8: times 16 dw ((1 << 8)-1)
+max_pixels_10: times 16 dw ((1 << 10)-1)
+max_pixels_12: times 16 dw ((1 << 12)-1)
+zero: times 8 dd 0
+one_per_32: times 8 dd 1
+
+SECTION_TEXT 32
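Note on the layout change: every constant is now replicated to a full 256-bit vector (16 words * 2 bytes = 32 bytes), and SECTION_RODATA takes a 32-byte alignment argument so that aligned 256-bit loads of e.g. [pw_bi_%2] (mova under INIT_YMM) stay legal; the old 16-byte rows would trip the alignment requirement of vmovdqa.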
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
times %2 d%3 10, -2
@@ -51,6 +52,8 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
%endmacro
+EPEL_TABLE 8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
EPEL_TABLE 8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
@@ -75,10 +78,15 @@ QPEL_TABLE 8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4
+QPEL_TABLE 8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
+
%define MAX_PB_SIZE 64
%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
+
%if ARCH_X86_64
%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
@@ -87,11 +95,22 @@ QPEL_TABLE 12, 4, w, sse4
%elif %1 <= 8
movdqa %3, [%2] ; load data from source2
%elif %1 <= 12
+%if cpuflag(avx2)
+ mova %3, [%2]
+%else
movdqa %3, [%2] ; load data from source2
movq %4, [%2+16] ; load data from source2
+%endif ;avx2
+%elif %1 <= 16
+%if cpuflag(avx2)
+ movu %3, [%2]
%else
movdqa %3, [%2] ; load data from source2
movdqa %4, [%2+16] ; load data from source2
+%endif ; avx2
+%else ; %1 = 32
+ movu %3, [%2]
+ movu %4, [%2+32]
%endif
%endmacro
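For context, src2 here is the 16-bit intermediate prediction buffer, so the widths count int16_t samples: a width of 32 spans 32 * 2 = 64 bytes, which is why the %1 = 32 branch above issues two unaligned 256-bit loads, from [%2] and [%2+32].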
@@ -100,71 +119,108 @@ QPEL_TABLE 12, 4, w, sse4
movd %4, [%3] ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
movq %4, [%3] ; load data from source
+%elif notcpuflag(avx)
+ movu %4, [%3] ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+ movdqu %4, [%3]
%else
- movdqu %4, [%3] ; load data from source
+ movu %4, [%3]
%endif
%endmacro
-%macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
-%if %1 == 2 || (%2 == 8 && %1 <= 4)
- movq %4, [%3] ; load data from source2
-%elif %1 == 4 || (%2 == 8 && %1 <= 8)
- movdqa %4, [%3] ; load data from source2
-%elif %1 <= 12
- movdqa %4, [%3] ; load data from source2
- movq %5, [%3+16] ; load data from source2
-%else
- movdqa %4, [%3] ; load data from source2
- movdqa %5, [%3+16] ; load data from source2
-%endif
-%endmacro
%macro EPEL_FILTER 2-4 ; bit depth, filter index
+%if cpuflag(avx2)
+%assign %%offset 32
+%ifdef PIC
+ lea rfilterq, [hevc_epel_filters_avx2_%1]
+%else
+ %define rfilterq hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
%ifdef PIC
lea rfilterq, [hevc_epel_filters_sse4_%1]
%else
%define rfilterq hevc_epel_filters_sse4_%1
%endif
+%endif ;cpuflag(avx2)
sub %2q, 1
+%if cpuflag(avx2)
+ shl %2q, 6 ; multiply by 64
+%else
shl %2q, 5 ; multiply by 32
- movdqa %3, [rfilterq + %2q] ; get 2 first values of filters
- movdqa %4, [rfilterq + %2q+16] ; get 2 last values of filters
+%endif
+%if %0 == 2
+ mova m14, [rfilterq + %2q] ; get 2 first values of filters
+ mova m15, [rfilterq + %2q+%%offset] ; get 2 last values of filters
+%else
+ mova %3, [rfilterq + %2q] ; get 2 first values of filters
+ mova %4, [rfilterq + %2q+%%offset] ; get 2 last values of filters
+%endif
%endmacro
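In C terms the indexing above is just a stride change on the filter tables; a minimal sketch (epel_filters and vec_bytes are illustrative names, not code from this patch):

    /* each filter phase holds two vectors of interleaved tap pairs, so the
     * per-phase stride is 2 * vec_bytes: 2*16 = 32 (shl 5) for SSE4,
     * 2*32 = 64 (shl 6) for AVX2 */
    const uint8_t *f01 = epel_filters + (mx - 1) * 2 * vec_bytes; /* taps 1-2 */
    const uint8_t *f23 = f01 + vec_bytes;                         /* taps 3-4 */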
%macro EPEL_HV_FILTER 1
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift 6
+%define %%table hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift 5
+%define %%table hevc_epel_filters_sse4_%1
+%endif
+
%ifdef PIC
- lea rfilterq, [hevc_epel_filters_sse4_%1]
+ lea rfilterq, [%%table]
%else
- %define rfilterq hevc_epel_filters_sse4_%1
+ %define rfilterq %%table
%endif
sub mxq, 1
sub myq, 1
- shl mxq, 5 ; multiply by 32
- shl myq, 5 ; multiply by 32
- movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
- movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
+ shl mxq, %%shift ; multiply by 32 (sse4) / 64 (avx2)
+ shl myq, %%shift ; multiply by 32 (sse4) / 64 (avx2)
+ mova m14, [rfilterq + mxq] ; get 2 first values of filters
+ mova m15, [rfilterq + mxq+%%offset] ; get 2 last values of filters
lea r3srcq, [srcstrideq*3]
+%if cpuflag(avx2)
+%define %%table hevc_epel_filters_avx2_10
+%else
+%define %%table hevc_epel_filters_sse4_10
+%endif
%ifdef PIC
- lea rfilterq, [hevc_epel_filters_sse4_10]
+ lea rfilterq, [%%table]
%else
- %define rfilterq hevc_epel_filters_sse4_10
+ %define rfilterq %%table
%endif
- movdqa m12, [rfilterq + myq] ; get 2 first values of filters
- movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
+ mova m12, [rfilterq + myq] ; get 2 first values of filters
+ mova m13, [rfilterq + myq+%%offset] ; get 2 last values of filters
%endmacro
%macro QPEL_FILTER 2
+
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift 7
+%define %%table hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift 6
+%define %%table hevc_qpel_filters_sse4_%1
+%endif
+
%ifdef PIC
- lea rfilterq, [hevc_qpel_filters_sse4_%1]
+ lea rfilterq, [%%table]
%else
- %define rfilterq hevc_qpel_filters_sse4_%1
+ %define rfilterq %%table
%endif
- lea %2q, [%2q*8-8]
- movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
- movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 first values of filters
- movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 first values of filters
- movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 first values of filters
+ sub %2q, 1
+ shl %2q, %%shift ; multiply by 32
+ mova m12, [rfilterq + %2q] ; get 4 first values of filters
+ mova m13, [rfilterq + %2q + %%offset] ; get 4 first values of filters
+ mova m14, [rfilterq + %2q + 2*%%offset] ; get 4 first values of filters
+ mova m15, [rfilterq + %2q + 3*%%offset] ; get 4 first values of filters
%endmacro
%macro EPEL_LOAD 4
@@ -191,19 +247,18 @@ QPEL_TABLE 12, 4, w, sse4
%%load m2, [rfilterq+2*%3q]
%%load m3, [rfilterq+r3srcq]
%endif
-
%if %1 == 8
%if %4 > 8
- SBUTTERFLY bw, 0, 1, 10
- SBUTTERFLY bw, 2, 3, 10
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
%else
punpcklbw m0, m1
punpcklbw m2, m3
%endif
%else
%if %4 > 4
- SBUTTERFLY wd, 0, 1, 10
- SBUTTERFLY wd, 2, 3, 10
+ SBUTTERFLY wd, 0, 1, 7
+ SBUTTERFLY wd, 2, 3, 7
%else
punpcklwd m0, m1
punpcklwd m2, m3
@@ -220,7 +275,7 @@ QPEL_TABLE 12, 4, w, sse4
%elif %3 == 8
%define %%load movq
%else
-%define %%load movdqu
+%define %%load movu
%endif
%else
%if %3 == 2
@@ -228,7 +283,7 @@ QPEL_TABLE 12, 4, w, sse4
%elif %3 == 4
%define %%load movq
%else
-%define %%load movdqu
+%define %%load movu
%endif
%endif
%%load m0, [%2-3*%%stride] ;load data from source
@@ -247,10 +302,10 @@ QPEL_TABLE 12, 4, w, sse4
SBUTTERFLY wd, 4, 5, %4
SBUTTERFLY wd, 6, 7, %4
%else
- punpcklwd m0, m1
- punpcklwd m2, m3
- punpcklwd m4, m5
- punpcklwd m6, m7
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
%endif
%else
%if %3 > 4
@@ -259,10 +314,10 @@ QPEL_TABLE 12, 4, w, sse4
SBUTTERFLY dq, 4, 5, %4
SBUTTERFLY dq, 6, 7, %4
%else
- punpckldq m0, m1
- punpckldq m2, m3
- punpckldq m4, m5
- punpckldq m6, m7
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
%endif
%endif
%endmacro
@@ -270,14 +325,14 @@ QPEL_TABLE 12, 4, w, sse4
%macro QPEL_V_LOAD 5
lea %5q, [%2]
sub %5q, r3srcq
- movdqu m0, [%5q ] ;load x- 3*srcstride
- movdqu m1, [%5q+ %3q ] ;load x- 2*srcstride
- movdqu m2, [%5q+ 2*%3q ] ;load x-srcstride
- movdqu m3, [%2 ] ;load x
- movdqu m4, [%2+ %3q] ;load x+stride
- movdqu m5, [%2+ 2*%3q] ;load x+2*stride
- movdqu m6, [%2+r3srcq] ;load x+3*stride
- movdqu m7, [%2+ 4*%3q] ;load x+4*stride
+ movu m0, [%5q ] ;load x- 3*srcstride
+ movu m1, [%5q+ %3q ] ;load x- 2*srcstride
+ movu m2, [%5q+ 2*%3q ] ;load x-srcstride
+ movu m3, [%2 ] ;load x
+ movu m4, [%2+ %3q] ;load x+stride
+ movu m5, [%2+ 2*%3q] ;load x+2*stride
+ movu m6, [%2+r3srcq] ;load x+3*stride
+ movu m7, [%2+ 4*%3q] ;load x+4*stride
%if %1 == 8
%if %4 > 8
SBUTTERFLY bw, 0, 1, 8
@@ -347,8 +402,17 @@ QPEL_TABLE 12, 4, w, sse4
movq [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+ movu [%1], %2
+%else
PEL_10STORE8 %1, %2, %3
movdqa [%1+16], %3
+%endif
+%endmacro
+
+%macro PEL_10STORE32 3
+ PEL_10STORE16 %1, %2, %3
+ movu [%1+32], %3
%endmacro
%macro PEL_8STORE2 3
@@ -370,7 +434,14 @@ QPEL_TABLE 12, 4, w, sse4
movd [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
- movdqa [%1], %2
+%if cpuflag(avx2)
+ movdqu [%1], %2
+%else
+ mova [%1], %2
+%endif ; avx2
+%endmacro
+%macro PEL_8STORE32 3
+ movu [%1], %2
%endmacro
%macro LOOP_END 3
@@ -381,65 +452,109 @@ QPEL_TABLE 12, 4, w, sse4
%endmacro
-%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
%if %2 == 8
+%if cpuflag(avx2) && %0 == 3
+%if %1 > 16
+ vextracti128 xm1, m0, 1
+ pmovzxbw m1, xm1
+ psllw m1, 14-%2
+%endif
+ pmovzxbw m0, xm0
+%else ; not avx2
%if %1 > 8
- punpckhbw m1, m0, m2
- psllw m1, 14-%2
+ punpckhbw m1, m0, m2
+ psllw m1, 14-%2
%endif
- punpcklbw m0, m2
+ punpcklbw m0, m2
%endif
- psllw m0, 14-%2
+%endif ;avx2
+ psllw m0, 14-%2
%endmacro
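Expressed with intrinsics, the new AVX2 branch (together with the preceding SIMPLE_LOAD) does the following for a 32-pixel, 8-bit row; a semantically equivalent sketch, not the exact emitted instructions:

    #include <immintrin.h>

    /* widen 32 packed bytes into two ymm registers of 16-bit words,
       scaled to the 14-bit intermediate format */
    static void widen32_8bit(const uint8_t *src, __m256i *w0, __m256i *w1)
    {
        __m256i px = _mm256_loadu_si256((const __m256i *)src);  /* 32 source bytes     */
        __m128i hi = _mm256_extracti128_si256(px, 1);           /* upper 16 bytes      */
        *w0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(px)); /* pmovzxbw, low half  */
        *w1 = _mm256_cvtepu8_epi16(hi);                         /* pmovzxbw, high half */
        *w0 = _mm256_slli_epi16(*w0, 14 - 8);                   /* psllw m0, 14-%2     */
        *w1 = _mm256_slli_epi16(*w1, 14 - 8);
    }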
-
-%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
%if %1 == 8
- pmaddubsw m0, %3 ;x1*c1+x2*c2
- pmaddubsw m2, %4 ;x3*c3+x4*c4
- paddw m0, m2
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+ vextracti128 xm10, m0, 1
+ vinserti128 m10, m1, xm10, 0
+%endif
+ vinserti128 m0, m0, xm1, 1
+ mova m1, m10
+%if %2 > 16
+ vextracti128 xm10, m2, 1
+ vinserti128 m10, m3, xm10, 0
+%endif
+ vinserti128 m2, m2, xm3, 1
+ mova m3, m10
+%endif
+ pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
+ pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
+ paddw %%reg0, %%reg2
%if %2 > 8
- pmaddubsw m1, %3
- pmaddubsw m3, %4
- paddw m1, m3
+ pmaddubsw %%reg1, %3
+ pmaddubsw %%reg3, %4
+ paddw %%reg1, %%reg3
%endif
%else
- pmaddwd m0, %3
- pmaddwd m2, %4
- paddd m0, m2
+ pmaddwd %%reg0, %3
+ pmaddwd %%reg2, %4
+ paddd %%reg0, %%reg2
%if %2 > 4
- pmaddwd m1, %3
- pmaddwd m3, %4
- paddd m1, m3
+ pmaddwd %%reg1, %3
+ pmaddwd %%reg3, %4
+ paddd %%reg1, %%reg3
+%if %1 != 8
+ psrad %%reg1, %1-8
+%endif
%endif
%if %1 != 8
- psrad m0, %1-8
- psrad m1, %1-8
+ psrad %%reg0, %1-8
%endif
- packssdw m0, m1
+ packssdw %%reg0, %%reg1
%endif
%endmacro
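The vextracti128/vinserti128 sequence at the top of the AVX2 path (the width > 16 case) regroups the two accumulators so that the low 128-bit lanes of both end up in one register and the high lanes in the other, which is what the in-lane pmaddubsw pairing requires. A semantically equivalent intrinsics sketch, using vperm2i128 instead of the extract/insert pair:

    #include <immintrin.h>

    static void lane_regroup(__m256i *a, __m256i *b)
    {
        __m256i lo = _mm256_permute2x128_si256(*a, *b, 0x20); /* low(a)  | low(b)  */
        __m256i hi = _mm256_permute2x128_si256(*a, *b, 0x31); /* high(a) | high(b) */
        *a = lo;
        *b = hi;
    }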
%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
+
+%if cpuflag(avx2)
+%assign %%offset 32
+%define %%table hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table hevc_qpel_filters_sse4_%2
+%endif
+
%ifdef PIC
- lea rfilterq, [hevc_qpel_filters_sse4_%2]
+ lea rfilterq, [%%table]
%else
- %define rfilterq hevc_qpel_filters_sse4_%2
+ %define rfilterq %%table
%endif
%if %2 == 8
pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
- pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
- pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
- pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
+ pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4
+ pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
+ pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
paddw m0, m2
paddw m4, m6
paddw m0, m4
%else
pmaddwd m0, [rfilterq + %3q*8 ]
- pmaddwd m2, [rfilterq + %3q*8+16]
- pmaddwd m4, [rfilterq + %3q*8+32]
- pmaddwd m6, [rfilterq + %3q*8+48]
+ pmaddwd m2, [rfilterq + %3q*8+%%offset]
+ pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
+ pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
paddd m0, m2
paddd m4, m6
paddd m0, m4
@@ -448,9 +563,9 @@ QPEL_TABLE 12, 4, w, sse4
%endif
%if %1 > 4
pmaddwd m1, [rfilterq + %3q*8 ]
- pmaddwd m3, [rfilterq + %3q*8+16]
- pmaddwd m5, [rfilterq + %3q*8+32]
- pmaddwd m7, [rfilterq + %3q*8+48]
+ pmaddwd m3, [rfilterq + %3q*8+%%offset]
+ pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
+ pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
paddd m1, m3
paddd m5, m7
paddd m1, m5
@@ -462,8 +577,32 @@ QPEL_TABLE 12, 4, w, sse4
%endif
%endmacro
-%macro QPEL_COMPUTE 2 ; width, bitdepth
+%macro QPEL_COMPUTE 2-3 ; width, bitdepth
%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
+
+ vextracti128 xm10, m0, 1
+ vinserti128 m10, m1, xm10, 0
+ vinserti128 m0, m0, xm1, 1
+ mova m1, m10
+
+ vextracti128 xm10, m2, 1
+ vinserti128 m10, m3, xm10, 0
+ vinserti128 m2, m2, xm3, 1
+ mova m3, m10
+
+ vextracti128 xm10, m4, 1
+ vinserti128 m10, m5, xm10, 0
+ vinserti128 m4, m4, xm5, 1
+ mova m5, m10
+
+ vextracti128 xm10, m6, 1
+ vinserti128 m10, m7, xm10, 0
+ vinserti128 m6, m6, xm7, 1
+ mova m7, m10
+%endif
+
pmaddubsw m0, m12 ;x1*c1+x2*c2
pmaddubsw m2, m13 ;x3*c3+x4*c4
pmaddubsw m4, m14 ;x5*c5+x6*c6
@@ -506,12 +645,16 @@ QPEL_TABLE 12, 4, w, sse4
%endif
%endmacro
-%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
paddsw %3, %5
%if %1 > 8
paddsw %4, %6
%endif
UNI_COMPUTE %1, %2, %3, %4, %7
+%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
+ vpermq %3, %3, 216
+ vpermq %4, %4, 216
+%endif
%endmacro
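On the 8-bit bi path, packuswb inside UNI_COMPUTE packs within each 128-bit lane, so the pixels come out lane-interleaved; the immediate 216 = 0xD8 selects qwords 0,2,1,3 and restores linear order before the store. With intrinsics (out is an illustrative variable):

    out = _mm256_permute4x64_epi64(out, 0xD8); /* vpermq: qword order 0,2,1,3 */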
%macro UNI_COMPUTE 5
@@ -524,14 +667,14 @@ QPEL_TABLE 12, 4, w, sse4
%else
pminsw %3, [max_pixels_%2]
pmaxsw %3, [zero]
-%if %1 > 8
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
pminsw %4, [max_pixels_%2]
pmaxsw %4, [zero]
%endif
%endif
%endmacro
-INIT_XMM sse4 ; adds ff_ and _sse4 to function name
+
; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
; uint8_t *_src, ptrdiff_t _srcstride,
@@ -539,15 +682,23 @@ INIT_XMM sse4 ; adds ff_ and _sse4 to functio
; ******************************
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS %1, %2
+%endmacro
+
+%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
pxor m2, m2
.loop
SIMPLE_LOAD %1, %2, srcq, m0
- MC_PIXEL_COMPUTE %1, %2
+ MC_PIXEL_COMPUTE %1, %2, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
+%endmacro
+%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
.loop
SIMPLE_LOAD %1, %2, srcq, m0
@@ -557,15 +708,17 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstri
dec heightd ; cmp height
jnz .loop ; height loop
RET
+%endmacro
+%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
pxor m2, m2
movdqa m5, [pw_bi_%2]
.loop
SIMPLE_LOAD %1, %2, srcq, m0
SIMPLE_BILOAD %1, src2q, m3, m4
- MC_PIXEL_COMPUTE %1, %2
- BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
+ MC_PIXEL_COMPUTE %1, %2, 1
+ BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -573,7 +726,6 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
dec heightd ; cmp height
jnz .loop ; height loop
RET
-
%endmacro
@@ -591,7 +743,7 @@ cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rf
EPEL_FILTER %2, mx, m4, m5
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m4, m5
+ EPEL_COMPUTE %2, %1, m4, m5, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
@@ -616,9 +768,9 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride,
EPEL_FILTER %2, mx, m4, m5
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m4, m5
+ EPEL_COMPUTE %2, %1, m4, m5, 1
SIMPLE_BILOAD %1, src2q, m2, m3
- BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -640,7 +792,7 @@ cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 11, dst, src, srcstride, height, r3src,
EPEL_FILTER %2, my, m4, m5
.loop
EPEL_LOAD %2, srcq, srcstride, %1
- EPEL_COMPUTE %2, %1, m4, m5
+ EPEL_COMPUTE %2, %1, m4, m5, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
@@ -669,9 +821,9 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride,
EPEL_FILTER %2, my, m4, m5
.loop
EPEL_LOAD %2, srcq, srcstride, %1
- EPEL_COMPUTE %2, %1, m4, m5
+ EPEL_COMPUTE %2, %1, m4, m5, 1
SIMPLE_BILOAD %1, src2q, m2, m3
- BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -695,19 +847,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
SWAP m6, m0
add srcq, srcstrideq
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
SWAP m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
@@ -716,10 +880,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m2, m0, xm4, 1
+ vextracti128 xm3, m0, 1
+ vinserti128 m3, m4, xm3, 0
+ PEL_10STORE%1 dstq, m2, m3
+%else
+ PEL_10STORE%1 dstq, m0, m4
+%endif
+%else
PEL_10STORE%1 dstq, m0, m1
+%endif
movdqa m4, m5
movdqa m5, m6
movdqa m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
LOOP_END dst, src, srcstride
RET
@@ -729,20 +914,32 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
SWAP m6, m0
add srcq, srcstrideq
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
- SWAP m7, m0
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
+ mova m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
%if %1 > 4
@@ -750,37 +947,62 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+ UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
+%else
UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
+%endif
PEL_%2STORE%1 dstq, m0, m1
- movdqa m4, m5
- movdqa m5, m6
- movdqa m6, m7
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
-
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
sub srcq, srcstrideq
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m8, m1
+%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m9, m1
+%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m10, m1
+%endif
SWAP m6, m0
add srcq, srcstrideq
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+ SWAP m11, m1
+%endif
SWAP m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
@@ -789,12 +1011,34 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
+ SIMPLE_BILOAD %1, src2q, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m1, m8, xm3, 1
+ vextracti128 xm8, m8, 1
+ vinserti128 m2, m3, xm8, 0
+ BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+ BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
SIMPLE_BILOAD %1, src2q, m8, m9
BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
- PEL_%2STORE%1 dstq, m0, m1
- movdqa m4, m5
- movdqa m5, m6
- movdqa m6, m7
+%endif
+ PEL_%2STORE%1 dstq, m0, m4
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%1 > 8 && (%2 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
add src2q, 2*MAX_PB_SIZE ; src += srcstride
@@ -814,7 +1058,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
- QPEL_COMPUTE %1, %2
+ QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
@@ -823,7 +1067,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
RET
cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
- movdqa m9, [pw_%2]
+ mova m9, [pw_%2]
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
@@ -844,12 +1088,12 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
- QPEL_COMPUTE %1, %2
+ QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
SIMPLE_BILOAD %1, src2q, m10, m11
- BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
+ BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -870,7 +1114,7 @@ cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 16, dst, src, srcstride, height, r3src,
QPEL_FILTER %2, my
.loop
QPEL_V_LOAD %2, srcq, srcstride, %1, r7
- QPEL_COMPUTE %1, %2
+ QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
@@ -901,13 +1145,13 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
lea r3srcq, [srcstrideq*3]
QPEL_FILTER %2, my
.loop
- SIMPLE_BILOAD %1, src2q, m10, m11
QPEL_V_LOAD %2, srcq, srcstride, %1, r9
- QPEL_COMPUTE %1, %2
+ QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
- BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
+ SIMPLE_BILOAD %1, src2q, m10, m11
+ BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -925,8 +1169,15 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
- lea mxq, [mxq*8-8]
- lea myq, [myq*8-8]
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (sse4) / 16 (avx2)
+ shl myq, %%shift ; multiply by 8 (sse4) / 16 (avx2)
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
@@ -994,8 +1245,15 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, m
RET
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
- lea mxq, [mxq*8-8]
- lea myq, [myq*8-8]
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (sse4) / 16 (avx2)
+ shl myq, %%shift ; multiply by 8 (sse4) / 16 (avx2)
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
@@ -1053,13 +1311,13 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
movq m13, m14
movq m14, m15
%else
- movdqa m8, m9
- movdqa m9, m10
- movdqa m10, m11
- movdqa m11, m12
- movdqa m12, m13
- movdqa m13, m14
- movdqa m14, m15
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+ mova m11, m12
+ mova m12, m13
+ mova m13, m14
+ mova m14, m15
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -1068,8 +1326,15 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
RET
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
- lea mxq, [mxq*8-8]
- lea myq, [myq*8-8]
+%if cpuflag(avx2)
+%assign %%shift 4
+%else
+%assign %%shift 3
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, %%shift ; multiply by 8 (sse4) / 16 (avx2)
+ shl myq, %%shift ; multiply by 8 (sse4) / 16 (avx2)
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
@@ -1286,6 +1551,8 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2,
RET
%endmacro
+INIT_XMM sse4 ; adds ff_ and _sse4 to function name
+
WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
@@ -1340,6 +1607,7 @@ HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
@@ -1377,4 +1645,23 @@ HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2 ; adds ff_ and _avx2 to function names and enables 256-bit registers: m0 is 256-bit, xm0 its 128-bit half; cpuflag(avx2) = 1, notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
+
+%endif ;AVX2
%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 8dea142..7864163 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -96,6 +96,40 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst
EPEL_PROTOTYPES(pel_pixels , 8, sse4);
EPEL_PROTOTYPES(pel_pixels , 10, sse4);
EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
@@ -111,6 +145,42 @@ EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
EPEL_PROTOTYPES(epel_hv , 12, sse4);
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
@@ -126,6 +196,41 @@ QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f082f4d..b1533d8 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -165,6 +165,149 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts
#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+ \
+{ \
+ ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t *src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
+ height, mx, my, width); \
+}
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
+ height, mx, my, width); \
+}
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4); \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4); \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
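For illustration, hand-expanding the first 10-bit invocation below, mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32), gives the following wrapper. width4 = 2*width2 because src is a byte pointer over 2-byte 10-bit samples, while dst is an int16_t pointer and advances in samples; the _8 variants defined next advance src by width2 directly, since 8-bit samples are one byte each:

    void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,
                                               int height, intptr_t mx, intptr_t my, int width)
    {
        ff_hevc_put_hevc_pel_pixels16_10_avx2(dst, src, _srcstride, height, mx, my, width);
        ff_hevc_put_hevc_pel_pixels8_10_sse4(dst + 16, src + 32, _srcstride, height, mx, my, width);
    }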
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+ \
+{ \
+ ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
+ ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t _srcstride, int16_t* src2, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
+ src2, height, mx, my, width); \
+ ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+ src2+width2, height, mx, my, width); \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t _srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
+ height, mx, my, width); \
+ ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+ height, mx, my, width); \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2); \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2); \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4);
+
+mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32);
+mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2);//used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2); //used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2);
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2);
+mc_rep_funcs(epel_h, 10, 16, 48, avx2);
+mc_rep_funcs(epel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2);
+mc_rep_funcs(epel_v, 10, 16, 48, avx2);
+mc_rep_funcs(epel_v, 10, 32, 64, avx2);
+
+
+mc_rep_funcs(epel_hv, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2);
+
+#endif //AVX2
+
mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
@@ -218,7 +361,6 @@ mc_rep_funcs(epel_hv, 8, 8, 64, sse4);
mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
-mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4);
mc_rep_funcs(epel_hv,10, 8, 64, sse4);
mc_rep_funcs(epel_hv,10, 8, 48, sse4);
@@ -619,6 +761,89 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
if (ARCH_X86_64) {
SAO_BAND_INIT(8, avx2);
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
}
c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
@@ -685,6 +910,149 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
+
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+ c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+ c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
}
c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
--
1.9.2.msysgit.0