[FFmpeg-devel] [PATCH 6/6] x86: hevc_mc: allow some functions for x86_32
Christophe Gisquet
christophe.gisquet at gmail.com
Sun Jun 1 16:13:02 CEST 2014
Now that the GPR/XMM register counts have decreased, some functions are
usable on x86_32 as well. Around 2% speedup.
---
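Reviewer note (below the --- line, so not part of the commit message): the
kernels that still need xmm8..xmm15 or more GPRs than x86_32 offers (epel_hv,
the qpel variants, weighted bi-prediction) stay under %if ARCH_X86_64 /
#if ARCH_X86_64; everything else is now assembled and linked on 32-bit too.
On the C side this uses the usual trick of a plain if (ARCH_X86_64) inside
the link macros, relying on dead-code elimination to drop the 64-bit-only
reference in 32-bit builds. A minimal, self-contained sketch of that idiom,
with hypothetical stand-in names rather than the real ff_hevc_* symbols:

#include <stdint.h>
#include <stdio.h>

#ifndef ARCH_X86_64           /* normally defined by configure */
#define ARCH_X86_64 0         /* assume a 32-bit build for this sketch */
#endif

/* Hypothetical stand-ins for the HEVC MC kernels. */
typedef void (*mc_fn)(int16_t *dst, const uint8_t *src, int height);
static void put_pixels_c(int16_t *d, const uint8_t *s, int h)     { (void)d; (void)s; (void)h; }
static void put_pixels_sse4(int16_t *d, const uint8_t *s, int h)  { (void)d; (void)s; (void)h; }
static void put_qpel_hv_sse4(int16_t *d, const uint8_t *s, int h) { (void)d; (void)s; (void)h; }

static void init_mc(mc_fn *pel, mc_fn *qpel_hv, int have_sse4)
{
    *pel     = put_pixels_c;             /* C fallbacks are installed first */
    *qpel_hv = put_pixels_c;             /* stands in for the C qpel_hv */
    if (have_sse4) {
        *pel = put_pixels_sse4;          /* now set on x86_32 and x86_64 alike */
        if (ARCH_X86_64)                 /* folded away in 32-bit builds, */
            *qpel_hv = put_qpel_hv_sse4; /* like if(ARCH_X86_64) in PEL_LINK */
    }
}

int main(void)
{
    mc_fn pel, qpel_hv;
    init_mc(&pel, &qpel_hv, 1);
    printf("pel is SSE4: %d, qpel_hv is SSE4: %d\n",
           pel == put_pixels_sse4, qpel_hv == put_qpel_hv_sse4);
    return 0;
}

With ARCH_X86_64 at 0 the guarded assignment simply disappears, so the
64-bit-only slots keep whatever the generic C init put there, which is what
the #if/if(ARCH_X86_64) guards in hevcdsp.h and hevcdsp_init.c achieve below.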
libavcodec/x86/hevc_mc.asm | 831 +++++++++++++++++++++---------------------
libavcodec/x86/hevcdsp.h | 3 +
libavcodec/x86/hevcdsp_init.c | 26 +-
3 files changed, 441 insertions(+), 419 deletions(-)
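One more note on the asm reorganisation in the first hunks: the PEL_*STORE*
and LOOP_END macros only move above the %if ARCH_X86_64 block so that x86_32
can assemble them; their behaviour is unchanged. As a reminder of what that
row loop does: the destination of the plain put_hevc_* variants is int16_t,
so LOOP_END advances the byte address by 2*dststride while src advances by
srcstride. A rough, self-contained C rendering with illustrative names only
(process_row stands in for the SIMPLE_LOAD / *_COMPUTE / PEL_10STORE
sequence, and the toy widen_row just mimics MC_PIXEL_COMPUTE's shift by
14-8 = 6 for 8-bit input):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void mc_loop(int16_t *dst, ptrdiff_t dststride,
                    const uint8_t *src, ptrdiff_t srcstride, int height,
                    void (*process_row)(int16_t *, const uint8_t *, int))
{
    do {
        process_row(dst, src, 8);   /* one row of width 8, say */
        dst += dststride;           /* int16_t*: lea dstq, [dstq+2*dststrideq] */
        src += srcstride;           /* lea srcq, [srcq+srcstrideq]             */
    } while (--height);             /* dec heightd ; jnz .loop                 */
}

/* Toy row op: widen 8-bit samples to the 14-bit intermediate range. */
static void widen_row(int16_t *dst, const uint8_t *src, int width)
{
    for (int x = 0; x < width; x++)
        dst[x] = (int16_t)(src[x] << 6);
}

int main(void)
{
    uint8_t src[4 * 8];
    int16_t dst[4 * 8];
    memset(src, 1, sizeof(src));
    mc_loop(dst, 8, src, 8, 4, widen_row);
    return dst[0] == 64 ? 0 : 1;    /* 1 << 6 */
}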
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 3ed2662..efde131 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -72,20 +72,57 @@ QPEL_TABLE 10, 4, w, sse4
%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
-%if ARCH_X86_64
+INIT_XMM sse4
+%macro PEL_10STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+ movdqa [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+ movdqa [%1], %2
+ movq [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+ PEL_10STORE8 %1, %2, %3
+ movdqa [%1+16], %3
+%endmacro
-%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
-%if %1 <= 4
- movq %3, [%2] ; load data from source2
-%elif %1 <= 8
- movdqa %3, [%2] ; load data from source2
-%elif %1 <= 12
- movdqa %3, [%2] ; load data from source2
- movq %4, [%2+16] ; load data from source2
-%else
- movdqa %3, [%2] ; load data from source2
- movdqa %4, [%2+16] ; load data from source2
-%endif
+%macro PEL_8STORE2 3
+ pextrw [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+ movd [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+ movd [%1], %2
+ pextrw [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+ movq [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+ movdqa [%1], %2
+%endmacro
+
+%macro LOOP_END 4
+ lea %1q, [%1q+2*%2q] ; dst += dststride
+ lea %3q, [%3q+ %4q] ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
%endmacro
%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
@@ -98,17 +135,17 @@ QPEL_TABLE 10, 4, w, sse4
%endif
%endmacro
-%macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
-%if %1 == 2 || (%2 == 8 && %1 <= 4)
- movq %4, [%3] ; load data from source2
-%elif %1 == 4 || (%2 == 8 && %1 <= 8)
- movdqa %4, [%3] ; load data from source2
+%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
+%if %1 <= 4
+ movq %3, [%2] ; load data from source2
+%elif %1 <= 8
+ movdqa %3, [%2] ; load data from source2
%elif %1 <= 12
- movdqa %4, [%3] ; load data from source2
- movq %5, [%3+16] ; load data from source2
+ movdqa %3, [%2] ; load data from source2
+ movq %4, [%2+16] ; load data from source2
%else
- movdqa %4, [%3] ; load data from source2
- movdqa %5, [%3+16] ; load data from source2
+ movdqa %3, [%2] ; load data from source2
+ movdqa %4, [%2+16] ; load data from source2
%endif
%endmacro
@@ -125,42 +162,6 @@ QPEL_TABLE 10, 4, w, sse4
mova m5, [FILTER+16] ; get 2 last values of filters
%endmacro
-%macro EPEL_HV_FILTER 1
-%ifdef PIC
- lea rfilterq, [hevc_epel_filters_sse4_%1]
-%else
- %define rfilterq hevc_epel_filters_sse4_%1
-%endif
- sub mxq, 1
- sub myq, 1
- shl mxq, 5 ; multiply by 32
- shl myq, 5 ; multiply by 32
- movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
- movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
- lea r3srcq, [srcstrideq*3]
-
-%ifdef PIC
- lea rfilterq, [hevc_epel_filters_sse4_10]
-%else
- %define rfilterq hevc_epel_filters_sse4_10
-%endif
- movdqa m12, [rfilterq + myq] ; get 2 first values of filters
- movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
-%endmacro
-
-%macro QPEL_FILTER 2
-%ifdef PIC
- lea rfilterq, [hevc_qpel_filters_sse4_%1]
-%else
- %define rfilterq hevc_qpel_filters_sse4_%1
-%endif
- lea %2q, [%2q*8-8]
- movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
- movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 first values of filters
- movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 first values of filters
- movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 first values of filters
-%endmacro
-
%macro EPEL_LOAD 4-5
%if %0 == 5
%define rfilterq %2
@@ -207,6 +208,125 @@ QPEL_TABLE 10, 4, w, sse4
%endmacro
+%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
+%if %2 == 8
+%if %1 > 8
+ punpckhbw m1, m0, m2
+ psllw m1, 14-%2
+%endif
+ punpcklbw m0, m2
+%endif
+ psllw m0, 14-%2
+%endmacro
+
+
+%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
+%if %1 == 8
+ pmaddubsw m0, %3 ;x1*c1+x2*c2
+ pmaddubsw m2, %4 ;x3*c3+x4*c4
+ paddw m0, m2
+%if %2 > 8
+ pmaddubsw m1, %3
+ pmaddubsw m3, %4
+ paddw m1, m3
+%endif
+%else
+ pmaddwd m0, %3
+ pmaddwd m2, %4
+ paddd m0, m2
+%if %2 > 4
+ pmaddwd m1, %3
+ pmaddwd m3, %4
+ paddd m1, m3
+%endif
+%if %1 != 8
+ psrad m0, %1-8
+ psrad m1, %1-8
+%endif
+ packssdw m0, m1
+%endif
+%endmacro
+
+
+%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+ paddsw %3, %5
+%if %1 > 8
+ paddsw %4, %6
+%endif
+ UNI_COMPUTE %1, %2, %3, %4, %7
+%endmacro
+
+%macro UNI_COMPUTE 5
+ pmulhrsw %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+ pmulhrsw %4, %5
+%endif
+%if %2 == 8
+ packuswb %3, %4
+%else
+ pminsw %3, [max_pixels_%2]
+ pmaxsw %3, [zero]
+%if %1 > 8
+ pminsw %4, [max_pixels_%2]
+ pmaxsw %4, [zero]
+%endif
+%endif
+%endmacro
+
+
+%if ARCH_X86_64
+
+%macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+ movq %4, [%3] ; load data from source2
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+ movdqa %4, [%3] ; load data from source2
+%elif %1 <= 12
+ movdqa %4, [%3] ; load data from source2
+ movq %5, [%3+16] ; load data from source2
+%else
+ movdqa %4, [%3] ; load data from source2
+ movdqa %5, [%3+16] ; load data from source2
+%endif
+%endmacro
+
+%macro EPEL_HV_FILTER 1
+%ifdef PIC
+ lea rfilterq, [hevc_epel_filters_sse4_%1]
+%else
+ %define rfilterq hevc_epel_filters_sse4_%1
+%endif
+ sub mxq, 1
+ sub myq, 1
+ shl mxq, 5 ; multiply by 32
+ shl myq, 5 ; multiply by 32
+ movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
+ movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
+ lea r3srcq, [srcstrideq*3]
+
+%ifdef PIC
+ lea rfilterq, [hevc_epel_filters_sse4_10]
+%else
+ %define rfilterq hevc_epel_filters_sse4_10
+%endif
+ movdqa m12, [rfilterq + myq] ; get 2 first values of filters
+ movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
+%endmacro
+
+%macro QPEL_FILTER 2
+%ifdef PIC
+ lea rfilterq, [hevc_qpel_filters_sse4_%1]
+%else
+ %define rfilterq hevc_qpel_filters_sse4_%1
+%endif
+ lea %2q, [%2q*8-8]
+ movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
+ movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 first values of filters
+ movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 first values of filters
+ movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 first values of filters
+%endmacro
+
+
%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
@@ -300,97 +420,6 @@ QPEL_TABLE 10, 4, w, sse4
%endif
%endmacro
-%macro PEL_10STORE2 3
- movd [%1], %2
-%endmacro
-%macro PEL_10STORE4 3
- movq [%1], %2
-%endmacro
-%macro PEL_10STORE6 3
- movq [%1], %2
- psrldq %2, 8
- movd [%1+8], %2
-%endmacro
-%macro PEL_10STORE8 3
- movdqa [%1], %2
-%endmacro
-%macro PEL_10STORE12 3
- movdqa [%1], %2
- movq [%1+16], %3
-%endmacro
-%macro PEL_10STORE16 3
- PEL_10STORE8 %1, %2, %3
- movdqa [%1+16], %3
-%endmacro
-
-%macro PEL_8STORE2 3
- pextrw [%1], %2, 0
-%endmacro
-%macro PEL_8STORE4 3
- movd [%1], %2
-%endmacro
-%macro PEL_8STORE6 3
- movd [%1], %2
- pextrw [%1+4], %2, 2
-%endmacro
-%macro PEL_8STORE8 3
- movq [%1], %2
-%endmacro
-%macro PEL_8STORE12 3
- movq [%1], %2
- psrldq %2, 8
- movd [%1+8], %2
-%endmacro
-%macro PEL_8STORE16 3
- movdqa [%1], %2
-%endmacro
-
-%macro LOOP_END 4
- lea %1q, [%1q+2*%2q] ; dst += dststride
- lea %3q, [%3q+ %4q] ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
-%endmacro
-
-
-%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
-%if %2 == 8
-%if %1 > 8
- punpckhbw m1, m0, m2
- psllw m1, 14-%2
-%endif
- punpcklbw m0, m2
-%endif
- psllw m0, 14-%2
-%endmacro
-
-
-%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
-%if %1 == 8
- pmaddubsw m0, %3 ;x1*c1+x2*c2
- pmaddubsw m2, %4 ;x3*c3+x4*c4
- paddw m0, m2
-%if %2 > 8
- pmaddubsw m1, %3
- pmaddubsw m3, %4
- paddw m1, m3
-%endif
-%else
- pmaddwd m0, %3
- pmaddwd m2, %4
- paddd m0, m2
-%if %2 > 4
- pmaddwd m1, %3
- pmaddwd m3, %4
- paddd m1, m3
-%endif
-%if %1 != 8
- psrad m0, %1-8
- psrad m1, %1-8
-%endif
- packssdw m0, m1
-%endif
-%endmacro
%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
%ifdef PIC
@@ -423,238 +452,59 @@ QPEL_TABLE 10, 4, w, sse4
pmaddwd m3, [rfilterq + %3q*8+16]
pmaddwd m5, [rfilterq + %3q*8+32]
pmaddwd m7, [rfilterq + %3q*8+48]
- paddd m1, m3
- paddd m5, m7
- paddd m1, m5
-%if %2 != 8
- psrad m1, %2-8
-%endif
-%endif
- p%4 m0, m1
-%endif
-%endmacro
-
-%macro QPEL_COMPUTE 2 ; width, bitdepth
-%if %2 == 8
- pmaddubsw m0, m12 ;x1*c1+x2*c2
- pmaddubsw m2, m13 ;x3*c3+x4*c4
- pmaddubsw m4, m14 ;x5*c5+x6*c6
- pmaddubsw m6, m15 ;x7*c7+x8*c8
- paddw m0, m2
- paddw m4, m6
- paddw m0, m4
-%if %1 > 8
- pmaddubsw m1, m12
- pmaddubsw m3, m13
- pmaddubsw m5, m14
- pmaddubsw m7, m15
- paddw m1, m3
- paddw m5, m7
- paddw m1, m5
-%endif
-%else
- pmaddwd m0, m12
- pmaddwd m2, m13
- pmaddwd m4, m14
- pmaddwd m6, m15
- paddd m0, m2
- paddd m4, m6
- paddd m0, m4
-%if %2 != 8
- psrad m0, %2-8
-%endif
-%if %1 > 4
- pmaddwd m1, m12
- pmaddwd m3, m13
- pmaddwd m5, m14
- pmaddwd m7, m15
- paddd m1, m3
- paddd m5, m7
- paddd m1, m5
-%if %2 != 8
- psrad m1, %2-8
-%endif
-%endif
-%endif
-%endmacro
-
-%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
- paddsw %3, %5
-%if %1 > 8
- paddsw %4, %6
-%endif
- UNI_COMPUTE %1, %2, %3, %4, %7
-%endmacro
-
-%macro UNI_COMPUTE 5
- pmulhrsw %3, %5
-%if %1 > 8 || (%2 > 8 && %1 > 4)
- pmulhrsw %4, %5
-%endif
-%if %2 == 8
- packuswb %3, %4
-%else
- pminsw %3, [max_pixels_%2]
- pmaxsw %3, [zero]
-%if %1 > 8
- pminsw %4, [max_pixels_%2]
- pmaxsw %4, [zero]
-%endif
-%endif
-%endmacro
-
-INIT_XMM sse4 ; adds ff_ and _sse4 to function name
-; ******************************
-; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *_src, ptrdiff_t _srcstride,
-; int height, int mx, int my)
-; ******************************
-
-%macro HEVC_PUT_HEVC_PEL_PIXELS 2
-cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
- pxor m2, m2
-.loop
- SIMPLE_LOAD %1, %2, srcq, m0
- MC_PIXEL_COMPUTE %1, %2
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
- pxor m2, m2
-.loop
- SIMPLE_LOAD %1, %2, srcq, m0
- PEL_%2STORE%1 dstq, m0, m1
- lea dstq, [dstq+dststrideq] ; dst += dststride
- lea srcq, [srcq+srcstrideq] ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
- pxor m2, m2
- movdqa m5, [pw_bi_%2]
-.loop
- SIMPLE_LOAD %1, %2, srcq, m0
- SIMPLE_BILOAD %1, src2q, m3, m4
- MC_PIXEL_COMPUTE %1, %2
- BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
- PEL_%2STORE%1 dstq, m0, m1
- lea dstq, [dstq+dststrideq] ; dst += dststride
- lea srcq, [srcq+srcstrideq] ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-%endmacro
-
-
-; ******************************
-; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *_src, ptrdiff_t _srcstride,
-; int width, int height, int mx, int my,
-; int16_t* mcbuffer)
-; ******************************
-
-
-%macro HEVC_PUT_HEVC_EPEL 2
-cglobal hevc_put_hevc_epel_h%1_%2, 6, 6, 6, dst, dststride, src, srcstride, height, mx
-%assign %%stride ((%2 + 7)/8)
- EPEL_FILTER %2, mx
-.loop
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 1
- EPEL_COMPUTE %2, %1, m4, m5
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, mx
-%assign %%stride ((%2 + 7)/8)
- movdqa m6, [pw_%2]
- EPEL_FILTER %2, mx
-.loop
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 1
- EPEL_COMPUTE %2, %1, m4, m5
- UNI_COMPUTE %1, %2, m0, m1, m6
- PEL_%2STORE%1 dstq, m0, m1
- lea dstq, [dstq+dststrideq] ; dst += dststride
- lea srcq, [srcq+srcstrideq] ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-cglobal hevc_put_hevc_bi_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx
- mov heightd, mxm
- movdqa m6, [pw_bi_%2]
- EPEL_FILTER %2, height
- mov heightd, heightm
-.loop
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 1
- EPEL_COMPUTE %2, %1, m4, m5
- SIMPLE_BILOAD %1, src2q, m2, m3
- BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
- PEL_%2STORE%1 dstq, m0, m1
- lea dstq, [dstq+dststrideq] ; dst += dststride
- lea srcq, [srcq+srcstrideq] ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-; ******************************
-; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
-; uint8_t *_src, ptrdiff_t _srcstride,
-; int width, int height, int mx, int my,
-; int16_t* mcbuffer)
-; ******************************
-
-cglobal hevc_put_hevc_epel_v%1_%2, 5, 6, 6, dst, dststride, src, srcstride, height, r3src, my
- mov r5d, mym
- EPEL_FILTER %2, r5
- sub srcq, srcstrideq
-.loop
- EPEL_LOAD %2, srcq, srcstride, %1, 1
- EPEL_COMPUTE %2, %1, m4, m5
- PEL_10STORE%1 dstq, m0, m1
- lea dstq, [dstq+2*dststrideq]
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 6, 7, dst, dststride, src, srcstride, height, r3src, my
- mov r5d, mym
- EPEL_FILTER %2, r5
- movdqa m6, [pw_%2]
- sub srcq, srcstrideq
-.loop
- EPEL_LOAD %2, srcq, srcstride, %1, 1
- EPEL_COMPUTE %2, %1, m4, m5
- UNI_COMPUTE %1, %2, m0, m1, m6
- PEL_%2STORE%1 dstq, m0, m1
- lea dstq, [dstq+dststrideq] ; dst += dststride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-
-cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 7, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my
- mov heightd, mym
- EPEL_FILTER %2, height
- movdqa m6, [pw_bi_%2]
- mov heightd, heightm
- sub srcq, srcstrideq
-.loop
- EPEL_LOAD %2, srcq, srcstride, %1, 1
- EPEL_COMPUTE %2, %1, m4, m5
- SIMPLE_BILOAD %1, src2q, m2, m3
- BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
- PEL_%2STORE%1 dstq, m0, m1
- lea dstq, [dstq+dststrideq] ; dst += dststride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+ p%4 m0, m1
+%endif
+%endmacro
+
+%macro QPEL_COMPUTE 2 ; width, bitdepth
+%if %2 == 8
+ pmaddubsw m0, m12 ;x1*c1+x2*c2
+ pmaddubsw m2, m13 ;x3*c3+x4*c4
+ pmaddubsw m4, m14 ;x5*c5+x6*c6
+ pmaddubsw m6, m15 ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%if %1 > 8
+ pmaddubsw m1, m12
+ pmaddubsw m3, m13
+ pmaddubsw m5, m14
+ pmaddubsw m7, m15
+ paddw m1, m3
+ paddw m5, m7
+ paddw m1, m5
+%endif
+%else
+ pmaddwd m0, m12
+ pmaddwd m2, m13
+ pmaddwd m4, m14
+ pmaddwd m6, m15
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, m12
+ pmaddwd m3, m13
+ pmaddwd m5, m14
+ pmaddwd m7, m15
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+%endif
%endmacro
@@ -1120,6 +970,37 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride
RET
%endmacro
+HEVC_PUT_HEVC_EPEL_HV 2, 8
+HEVC_PUT_HEVC_EPEL_HV 4, 8
+HEVC_PUT_HEVC_EPEL_HV 6, 8
+HEVC_PUT_HEVC_EPEL_HV 8, 8
+
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
+
+
+HEVC_PUT_HEVC_QPEL 4, 8
+HEVC_PUT_HEVC_QPEL 8, 8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
+
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+%endif ; ARCH_X86_64
+
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
@@ -1172,6 +1053,7 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh
jnz .loop ; height loop
RET
+%if ARCH_X86_64
cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
mov r6d, denomm
movd m2, wx0m ; WX0
@@ -1225,8 +1107,11 @@ cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2,
dec r6d ; cmp height
jnz .loop ; height loop
RET
+%endif ; ~ARCH_X86_64
+
%endmacro
+INIT_XMM sse4
WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
@@ -1237,6 +1122,52 @@ WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10
+
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int height, int mx, int my)
+; ******************************
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+ pxor m2, m2
+.loop
+ SIMPLE_LOAD %1, %2, srcq, m0
+ MC_PIXEL_COMPUTE %1, %2
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, dststride, src, srcstride
+ RET
+
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+ pxor m2, m2
+.loop
+ SIMPLE_LOAD %1, %2, srcq, m0
+ PEL_%2STORE%1 dstq, m0, m1
+ lea dstq, [dstq+dststrideq] ; dst += dststride
+ lea srcq, [srcq+srcstrideq] ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
+ pxor m2, m2
+ movdqa m5, [pw_bi_%2]
+.loop
+ SIMPLE_LOAD %1, %2, srcq, m0
+ SIMPLE_BILOAD %1, src2q, m3, m4
+ MC_PIXEL_COMPUTE %1, %2
+ BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
+ PEL_%2STORE%1 dstq, m0, m1
+ lea dstq, [dstq+dststrideq] ; dst += dststride
+ lea srcq, [srcq+srcstrideq] ; src += srcstride
+ lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+%endmacro
+
+INIT_XMM sse4
HEVC_PUT_HEVC_PEL_PIXELS 2, 8
HEVC_PUT_HEVC_PEL_PIXELS 4, 8
HEVC_PUT_HEVC_PEL_PIXELS 6, 8
@@ -1250,6 +1181,112 @@ HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int width, int height, int mx, int my,
+; int16_t* mcbuffer)
+; ******************************
+%macro HEVC_PUT_HEVC_EPEL 2
+cglobal hevc_put_hevc_epel_h%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, mx
+%assign %%stride ((%2 + 7)/8)
+ EPEL_FILTER %2, mx
+.loop
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 6
+ EPEL_COMPUTE %2, %1, m4, m5
+ PEL_10STORE%1 dstq, m0, m1
+ LOOP_END dst, dststride, src, srcstride
+ RET
+
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 6, 8, dst, dststride, src, srcstride, height, mx
+%assign %%stride ((%2 + 7)/8)
+ movdqa m6, [pw_%2]
+ EPEL_FILTER %2, mx
+.loop
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 7
+ EPEL_COMPUTE %2, %1, m4, m5
+ UNI_COMPUTE %1, %2, m0, m1, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ lea dstq, [dstq+dststrideq] ; dst += dststride
+ lea srcq, [srcq+srcstrideq] ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 6, 7, 8, dst, dststride, src, srcstride, src2, src2stride,height, mx
+ mov heightd, mxm
+ movdqa m6, [pw_bi_%2]
+ EPEL_FILTER %2, height
+ mov heightd, heightm
+.loop
+ EPEL_LOAD %2, srcq-%%stride, %%stride, %1, 7
+ EPEL_COMPUTE %2, %1, m4, m5
+ SIMPLE_BILOAD %1, src2q, m2, m3
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ lea dstq, [dstq+dststrideq] ; dst += dststride
+ lea srcq, [srcq+srcstrideq] ; src += srcstride
+ lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+; uint8_t *_src, ptrdiff_t _srcstride,
+; int width, int height, int mx, int my,
+; int16_t* mcbuffer)
+; ******************************
+
+cglobal hevc_put_hevc_epel_v%1_%2, 5, 6, 7, dst, dststride, src, srcstride, height, r3src, my
+ mov r5d, mym
+ EPEL_FILTER %2, r5
+ sub srcq, srcstrideq
+.loop
+ EPEL_LOAD %2, srcq, srcstride, %1, 6
+ EPEL_COMPUTE %2, %1, m4, m5
+ PEL_10STORE%1 dstq, m0, m1
+ lea dstq, [dstq+2*dststrideq]
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 6, 8, dst, dststride, src, srcstride, height, r3src, my
+ mov r5d, mym
+ EPEL_FILTER %2, r5
+ movdqa m6, [pw_%2]
+ sub srcq, srcstrideq
+.loop
+ EPEL_LOAD %2, srcq, srcstride, %1, 7
+ EPEL_COMPUTE %2, %1, m4, m5
+ UNI_COMPUTE %1, %2, m0, m1, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ lea dstq, [dstq+dststrideq] ; dst += dststride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 7, 8, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my
+ mov heightd, mym
+ EPEL_FILTER %2, height
+ movdqa m6, [pw_bi_%2]
+ mov heightd, heightm
+ sub srcq, srcstrideq
+.loop
+ EPEL_LOAD %2, srcq, srcstride, %1, 7
+ EPEL_COMPUTE %2, %1, m4, m5
+ SIMPLE_BILOAD %1, src2q, m2, m3
+ BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
+ PEL_%2STORE%1 dstq, m0, m1
+ lea dstq, [dstq+dststrideq] ; dst += dststride
+ lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
+
+INIT_XMM sse4
HEVC_PUT_HEVC_EPEL 2, 8
HEVC_PUT_HEVC_EPEL 4, 8
HEVC_PUT_HEVC_EPEL 6, 8
@@ -1262,35 +1299,3 @@ HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10
-
-
-HEVC_PUT_HEVC_EPEL_HV 2, 8
-HEVC_PUT_HEVC_EPEL_HV 4, 8
-HEVC_PUT_HEVC_EPEL_HV 6, 8
-HEVC_PUT_HEVC_EPEL_HV 8, 8
-
-HEVC_PUT_HEVC_EPEL_HV 2, 10
-HEVC_PUT_HEVC_EPEL_HV 4, 10
-HEVC_PUT_HEVC_EPEL_HV 6, 10
-HEVC_PUT_HEVC_EPEL_HV 8, 10
-
-
-HEVC_PUT_HEVC_QPEL 4, 8
-HEVC_PUT_HEVC_QPEL 8, 8
-HEVC_PUT_HEVC_QPEL 12, 8
-HEVC_PUT_HEVC_QPEL 16, 8
-
-HEVC_PUT_HEVC_QPEL 4, 10
-HEVC_PUT_HEVC_QPEL 8, 10
-
-HEVC_PUT_HEVC_QPEL_HV 2, 8
-HEVC_PUT_HEVC_QPEL_HV 4, 8
-HEVC_PUT_HEVC_QPEL_HV 6, 8
-HEVC_PUT_HEVC_QPEL_HV 8, 8
-
-HEVC_PUT_HEVC_QPEL_HV 2, 10
-HEVC_PUT_HEVC_QPEL_HV 4, 10
-HEVC_PUT_HEVC_QPEL_HV 6, 10
-HEVC_PUT_HEVC_QPEL_HV 8, 10
-
-%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index c5a64c7..aca3754 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -33,6 +33,7 @@ dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+if(ARCH_X86_64) \
dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
@@ -100,6 +101,7 @@ EPEL_PROTOTYPES(epel_h , 10, sse4);
EPEL_PROTOTYPES(epel_v , 8, sse4);
EPEL_PROTOTYPES(epel_v , 10, sse4);
+#if ARCH_X86_64
EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
@@ -114,6 +116,7 @@ QPEL_PROTOTYPES(qpel_v, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+#endif
WEIGHTING_PROTOTYPES(8, sse4);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 30902be..44855d1 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -103,7 +103,7 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dst
mc_rep_bi_func(name, bitd, step, W, opt)
-#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+#if HAVE_SSE4_EXTERNAL
mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
@@ -137,6 +137,7 @@ mc_rep_funcs(epel_v,10, 8, 32, sse4);
mc_rep_funcs(epel_v,10, 8, 24, sse4);
mc_rep_funcs(epel_v,10, 8, 16, sse4);
mc_rep_funcs(epel_v,10, 4, 12, sse4);
+# if ARCH_X86_64
mc_rep_funcs(epel_hv, 8, 8, 64, sse4);
mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
@@ -182,6 +183,7 @@ mc_rep_funcs(qpel_hv,10, 8, 32, sse4);
mc_rep_funcs(qpel_hv,10, 8, 24, sse4);
mc_rep_funcs(qpel_hv,10, 8, 16, sse4);
mc_rep_funcs(qpel_hv,10, 4, 12, sse4);
+# endif
#define mc_rep_uni_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\
@@ -212,6 +214,7 @@ mc_rep_uni_w(10, 8, 32, sse4);
mc_rep_uni_w(10, 8, 48, sse4);
mc_rep_uni_w(10, 8, 64, sse4);
+# if ARCH_X86_64
#define mc_rep_bi_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
int16_t *_src2, ptrdiff_t _src2stride, int height, \
@@ -243,6 +246,7 @@ mc_rep_bi_w(10, 8, 24, sse4);
mc_rep_bi_w(10, 8, 32, sse4);
mc_rep_bi_w(10, 8, 48, sse4);
mc_rep_bi_w(10, 8, 64, sse4);
+# endif
#define mc_uni_w_func(name, bitd, W, opt) \
void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
@@ -272,11 +276,13 @@ mc_uni_w_funcs(epel_h, 8, sse4);
mc_uni_w_func(epel_h, 8, 6, sse4);
mc_uni_w_funcs(epel_v, 8, sse4);
mc_uni_w_func(epel_v, 8, 6, sse4);
+# if ARCH_X86_64
mc_uni_w_funcs(epel_hv, 8, sse4);
mc_uni_w_func(epel_hv, 8, 6, sse4);
mc_uni_w_funcs(qpel_h, 8, sse4);
mc_uni_w_funcs(qpel_v, 8, sse4);
mc_uni_w_funcs(qpel_hv, 8, sse4);
+# endif
mc_uni_w_funcs(pel_pixels, 10, sse4);
mc_uni_w_func(pel_pixels, 10, 6, sse4);
@@ -284,6 +290,7 @@ mc_uni_w_funcs(epel_h, 10, sse4);
mc_uni_w_func(epel_h, 10, 6, sse4);
mc_uni_w_funcs(epel_v, 10, sse4);
mc_uni_w_func(epel_v, 10, 6, sse4);
+# if ARCH_X86_64
mc_uni_w_funcs(epel_hv, 10, sse4);
mc_uni_w_func(epel_hv, 10, 6, sse4);
mc_uni_w_funcs(qpel_h, 10, sse4);
@@ -338,11 +345,12 @@ mc_bi_w_func(epel_hv, 10, 6, sse4);
mc_bi_w_funcs(qpel_h, 10, sse4);
mc_bi_w_funcs(qpel_v, 10, sse4);
mc_bi_w_funcs(qpel_hv, 10, sse4);
+# endif
-#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+#endif //HAVE_SSE4_EXTERNAL
-#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
@@ -353,6 +361,7 @@ mc_bi_w_funcs(qpel_hv, 10, sse4);
PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
+ if(ARCH_X86_64) { \
PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
@@ -360,7 +369,8 @@ mc_bi_w_funcs(qpel_hv, 10, sse4);
PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
- PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
+ PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt ); \
+ }
void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
@@ -376,17 +386,19 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
}
- if (EXTERNAL_SSE4(mm_flags) && ARCH_X86_64) {
+ if (EXTERNAL_SSE4(mm_flags)) {
EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
+#if ARCH_X86_64
EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
+#endif
}
} else if (bit_depth == 10) {
@@ -398,17 +410,19 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
}
- if (EXTERNAL_SSE4(mm_flags) && ARCH_X86_64) {
+ if (EXTERNAL_SSE4(mm_flags)) {
EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
+#if ARCH_X86_64
EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
+#endif
}
}
}
--
1.8.0.msysgit.0