[FFmpeg-cvslog] vp9/x86: intra prediction sse2/32bit support.
Ronald S. Bultje
git at videolan.org
Fri Dec 19 14:30:44 CET 2014
ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Mon Dec 15 22:13:52 2014 -0500| [bdc1e3e3b27d2b35ea88a964254e311d359aac69] | committer: Michael Niedermayer
vp9/x86: intra prediction sse2/32bit support.
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bdc1e3e3b27d2b35ea88a964254e311d359aac69
---
libavcodec/x86/constants.c | 2 +-
libavcodec/x86/constants.h | 2 +-
libavcodec/x86/vp9dsp_init.c | 169 +++++---
libavcodec/x86/vp9intrapred.asm | 902 ++++++++++++++++++++++++++++++---------
4 files changed, 805 insertions(+), 270 deletions(-)
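The new mmxext/sse2 paths below replace the SSSE3-only pmulhrsw/pshufb idiom with paddw+psraw for the rounded average and punpck/pshuf sequences for the broadcast, so 32-bit and pre-SSSE3 CPUs also get accelerated intra prediction. A minimal C sketch (illustrative only, not FFmpeg code) of why the rounded-shift substitution in the dc predictors is exact:

    /* Illustrative only (not FFmpeg code): pmulhrsw by a power-of-two
     * constant c computes (x*c + 0x4000) >> 15, i.e. a rounded shift.
     * The SSE2 dc paths in this patch express the same average with
     * paddw + psraw instead. */
    #include <assert.h>
    #include <stdint.h>

    static int16_t pmulhrsw_scalar(int16_t x, int16_t c)
    {
        return (int16_t)(((int32_t)x * c + 0x4000) >> 15);
    }

    int main(void)
    {
        for (int sum = 0; sum <= 16 * 255; sum++)   /* dc_8x8: 16 samples */
            assert(pmulhrsw_scalar(sum, 2048) == ((sum + 8) >> 4));
        for (int sum = 0; sum <=  8 * 255; sum++)   /* dc_4x4:  8 samples */
            assert(pmulhrsw_scalar(sum, 4096) == ((sum + 4) >> 3));
        return 0;
    }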
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index aa3df00..d78b896 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -40,7 +40,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(8, const xmm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index e75fff9..1c24dda3 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -42,7 +42,7 @@ extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
+extern const xmm_reg ff_pw_255;
extern const xmm_reg ff_pw_512;
extern const xmm_reg ff_pw_1024;
extern const xmm_reg ff_pw_2048;
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 37173fb..7acf4f7 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -243,40 +243,58 @@ lpf_funcs(88, 16, avx);
void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
const uint8_t *l, const uint8_t *a)
-#define ipred_funcs(type, opt) \
-ipred_func(4, type, opt); \
-ipred_func(8, type, opt); \
-ipred_func(16, type, opt); \
-ipred_func(32, type, opt)
+ipred_func(8, v, mmx);
-ipred_funcs(dc, ssse3);
-ipred_funcs(dc_left, ssse3);
-ipred_funcs(dc_top, ssse3);
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
-#undef ipred_funcs
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
-ipred_func(8, v, mmx);
-ipred_func(16, v, sse2);
-ipred_func(32, v, sse2);
-
-#define ipred_func_set(size, type, opt1, opt2) \
-ipred_func(size, type, opt1); \
-ipred_func(size, type, opt2)
-
-#define ipred_funcs(type, opt1, opt2) \
-ipred_func(4, type, opt1); \
-ipred_func_set(8, type, opt1, opt2); \
-ipred_func_set(16, type, opt1, opt2); \
-ipred_func_set(32, type, opt1, opt2)
-
-ipred_funcs(h, ssse3, avx);
-ipred_funcs(tm, ssse3, avx);
-ipred_funcs(dl, ssse3, avx);
-ipred_funcs(dr, ssse3, avx);
-ipred_funcs(hu, ssse3, avx);
-ipred_funcs(hd, ssse3, avx);
-ipred_funcs(vl, ssse3, avx);
-ipred_funcs(vr, ssse3, avx);
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
+
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
+
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
+
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
+
+ipred_func(4, h, sse2);
+
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
+
+// FIXME hd/vl_4x4_ssse3 does not exist
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
+
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
ipred_func(32, dc, avx2);
ipred_func(32, dc_left, avx2);
@@ -285,9 +303,14 @@ ipred_func(32, v, avx2);
ipred_func(32, h, avx2);
ipred_func(32, tm, avx2);
-#undef ipred_funcs
-#undef ipred_func_set
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
#endif /* HAVE_YASM */
@@ -340,23 +363,32 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
} \
} while (0)
-#define init_ipred(tx, sz, opt) do { \
- dsp->intra_pred[tx][HOR_PRED] = ff_vp9_ipred_h_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = ff_vp9_ipred_dl_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = ff_vp9_ipred_dr_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][HOR_DOWN_PRED] = ff_vp9_ipred_hd_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][VERT_LEFT_PRED] = ff_vp9_ipred_vl_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][HOR_UP_PRED] = ff_vp9_ipred_hu_##sz##x##sz##_##opt; \
- if (ARCH_X86_64 || tx != TX_32X32) { \
- dsp->intra_pred[tx][VERT_RIGHT_PRED] = ff_vp9_ipred_vr_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][TM_VP8_PRED] = ff_vp9_ipred_tm_##sz##x##sz##_##opt; \
- } \
+#define init_ipred(sz, opt, t, e) \
+ dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+ init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+ init_ipred(sz, opt, hd, HOR_DOWN); \
+ init_ipred(sz, opt, vl, VERT_LEFT); \
+ init_ipred(sz, opt, hu, HOR_UP); \
+ init_ipred(sz, opt, tm, TM_VP8); \
+ init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+ init_dir_tm_ipred(sz, opt); \
+ init_ipred(sz, opt, h, HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dc, DC); \
+ init_ipred(sz, opt, dc_left, LEFT_DC); \
+ init_ipred(sz, opt, dc_top, TOP_DC); \
} while (0)
-#define init_dc_ipred(tx, sz, opt) do { \
- init_ipred(tx, sz, opt); \
- dsp->intra_pred[tx][DC_PRED] = ff_vp9_ipred_dc_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_##sz##x##sz##_##opt; \
- dsp->intra_pred[tx][TOP_DC_PRED] = ff_vp9_ipred_dc_top_##sz##x##sz##_##opt; \
+#define init_all_ipred(sz, opt) do { \
+ init_dc_ipred(sz, opt); \
+ init_dir_tm_h_ipred(sz, opt); \
} while (0)
if (EXTERNAL_MMX(cpu_flags)) {
@@ -366,7 +398,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
- dsp->intra_pred[TX_8X8][VERT_PRED] = ff_vp9_ipred_v_8x8_mmx;
+ init_ipred(8, mmx, v, VERT);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
@@ -375,12 +407,17 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(4, 1, 4, avg, mmxext);
init_fpel(3, 1, 8, avg, mmxext);
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+ init_dc_ipred(4, mmxext);
+ init_dc_ipred(8, mmxext);
+ init_dir_tm_ipred(4, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
init_fpel(2, 0, 16, put, sse);
init_fpel(1, 0, 32, put, sse);
init_fpel(0, 0, 64, put, sse);
+ init_ipred(16, sse, v, VERT);
+ init_ipred(32, sse, v, VERT);
}
if (EXTERNAL_SSE2(cpu_flags)) {
@@ -405,8 +442,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
- dsp->intra_pred[TX_16X16][VERT_PRED] = ff_vp9_ipred_v_16x16_sse2;
- dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_sse2;
+ init_dc_ipred(16, sse2);
+ init_dc_ipred(32, sse2);
+ init_dir_tm_h_ipred(8, sse2);
+ init_dir_tm_h_ipred(16, sse2);
+ init_dir_tm_h_ipred(32, sse2);
+ init_ipred(4, sse2, h, HOR);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
@@ -429,10 +470,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
init_lpf(ssse3);
- init_dc_ipred(TX_4X4, 4, ssse3);
- init_dc_ipred(TX_8X8, 8, ssse3);
- init_dc_ipred(TX_16X16, 16, ssse3);
- init_dc_ipred(TX_32X32, 32, ssse3);
+ init_all_ipred(4, ssse3);
+ init_all_ipred(8, ssse3);
+ init_all_ipred(16, ssse3);
+ init_all_ipred(32, ssse3);
}
if (EXTERNAL_AVX(cpu_flags)) {
@@ -451,9 +492,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(1, 0, 32, put, avx);
init_fpel(0, 0, 64, put, avx);
init_lpf(avx);
- init_ipred(TX_8X8, 8, avx);
- init_ipred(TX_16X16, 16, avx);
- init_ipred(TX_32X32, 32, avx);
+ init_dir_tm_h_ipred(8, avx);
+ init_dir_tm_h_ipred(16, avx);
+ init_dir_tm_h_ipred(32, avx);
+ init_ipred(32, avx, v, VERT);
}
if (EXTERNAL_AVX2(cpu_flags)) {
@@ -465,12 +507,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_subpel3_32_64(1, avg, avx2);
#endif
}
- dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2;
- dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
- dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2;
- dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2;
- dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2;
- dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2;
+ init_dc_ipred(32, avx2);
+ init_ipred(32, avx2, h, HOR);
+ init_ipred(32, avx2, tm, TM_VP8);
}
#undef init_fpel
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
index 6621226..169676f 100644
--- a/libavcodec/x86/vp9intrapred.asm
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -66,11 +66,23 @@ pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
pb_2: times 32 db 2
pb_15: times 16 db 15
+pb_15x0_1xm1: times 15 db 0
+ db -1
pb_0to2_5x3: db 0, 1, 2
times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+ times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+ times 2 db -1
cextern pb_1
cextern pb_3
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
cextern pw_512
cextern pw_1024
cextern pw_2048
@@ -80,14 +92,21 @@ SECTION .text
; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
-INIT_MMX ssse3
+%macro DC_4to8_FUNCS 0
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
movd m0, [lq]
punpckldq m0, [aq]
pxor m1, m1
psadbw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_4096]
pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -95,7 +114,6 @@ cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
movd [dstq+strideq*1], m0
RET
-INIT_MMX ssse3
cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
movq m0, [lq]
movq m1, [aq]
@@ -105,8 +123,15 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
psadbw m0, m2
psadbw m1, m2
paddw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_2048]
pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
@@ -117,8 +142,14 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
RET
+%endmacro
-INIT_XMM ssse3
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
mova m0, [lq]
mova m1, [aq]
@@ -130,8 +161,16 @@ cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
paddw m0, m1
movhlps m1, m0
paddw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_1024]
pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
mov cntd, 4
.loop:
mova [dstq+strideq*0], m0
@@ -143,7 +182,6 @@ cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
jg .loop
RET
-INIT_XMM ssse3
cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
mova m0, [lq]
mova m1, [lq+16]
@@ -161,8 +199,16 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
paddw m0, m2
movhlps m1, m0
paddw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_512]
pshufb m0, m4
+%else
+ paddw m0, [pw_32]
+ psraw m0, 6
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
mov cntd, 8
.loop:
mova [dstq+strideq*0+ 0], m0
@@ -177,6 +223,12 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
dec cntd
jg .loop
RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
@@ -214,14 +266,20 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
-%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
-INIT_MMX ssse3
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
movd m0, [%2q]
pxor m1, m1
psadbw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_8192]
pshufb m0, m1
+%else
+ paddw m0, [pw_2]
+ psraw m0, 2
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -229,15 +287,21 @@ cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
movd [dstq+strideq*1], m0
RET
-INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
movq m0, [%2q]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pxor m1, m1
psadbw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_4096]
pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
@@ -248,8 +312,16 @@ cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
RET
+%endmacro
-INIT_XMM ssse3
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
mova m0, [%2q]
DEFINE_ARGS dst, stride, stride3, cnt
@@ -258,8 +330,16 @@ cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
psadbw m0, m2
movhlps m1, m0
paddw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_2048]
pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
mov cntd, 4
.loop:
mova [dstq+strideq*0], m0
@@ -271,7 +351,6 @@ cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
jg .loop
RET
-INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
mova m0, [%2q]
mova m1, [%2q+16]
@@ -283,8 +362,16 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
paddw m0, m1
movhlps m1, m0
paddw m0, m1
+%if cpuflag(ssse3)
pmulhrsw m0, [pw_1024]
pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
mov cntd, 8
.loop:
mova [dstq+strideq*0+ 0], m0
@@ -299,9 +386,17 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
dec cntd
jg .loop
RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
mova m0, [%2q]
DEFINE_ARGS dst, stride, stride3, cnt
@@ -332,8 +427,9 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
%endif
%endmacro
-DC_1D_FUNCS top, a
-DC_1D_FUNCS left, l
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top, a
+DC_1D_AVX2_FUNCS left, l
; v
@@ -353,7 +449,7 @@ cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
movq [dstq+stride3q ], m0
RET
-INIT_XMM sse2
+INIT_XMM sse
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
mova m0, [aq]
DEFINE_ARGS dst, stride, stride3, cnt
@@ -369,7 +465,7 @@ cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
jg .loop
RET
-INIT_XMM sse2
+INIT_XMM sse
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
mova m0, [aq]
mova m1, [aq+16]
@@ -390,8 +486,7 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
jg .loop
RET
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
+INIT_YMM avx
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
mova m0, [aq]
DEFINE_ARGS dst, stride, stride3, cnt
@@ -411,14 +506,20 @@ cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
dec cntd
jg .loop
RET
-%endif
; h
-INIT_XMM ssse3
+%macro H_XMM_FUNCS 2
+%if notcpuflag(avx)
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
movd m0, [lq]
+%if cpuflag(ssse3)
pshufb m0, [pb_4x3_4x2_4x1_4x0]
+%else
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0123
+ punpcklwd m0, m0
+%endif
lea stride3q, [strideq*3]
movd [dstq+strideq*0], m0
psrldq m0, 4
@@ -428,18 +529,26 @@ cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
psrldq m0, 4
movd [dstq+stride3q ], m0
RET
+%endif
-%macro H_XMM_FUNCS 1
-INIT_XMM %1
-cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
mova m2, [pb_8x1_8x0]
mova m3, [pb_8x3_8x2]
+%endif
lea stride3q, [strideq*3]
mov cntq, 1
.loop:
movd m0, [lq+cntq*4]
+%if cpuflag(ssse3)
pshufb m1, m0, m3
pshufb m0, m2
+%else
+ punpcklbw m0, m0
+ punpcklwd m0, m0
+ pshufd m1, m0, q2233
+ pshufd m0, m0, q0011
+%endif
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
movq [dstq+strideq*2], m0
@@ -449,22 +558,35 @@ cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
jge .loop
RET
-INIT_XMM %1
-cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
mova m5, [pb_1]
mova m6, [pb_2]
mova m7, [pb_3]
pxor m4, m4
+%endif
lea stride3q, [strideq*3]
mov cntq, 3
.loop:
movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
pshufb m0, m3, m7
pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
+%if cpuflag(ssse3)
pshufb m2, m3, m5
pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
@@ -472,24 +594,37 @@ cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
jge .loop
RET
-INIT_XMM %1
-cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
mova m5, [pb_1]
mova m6, [pb_2]
mova m7, [pb_3]
pxor m4, m4
+%endif
lea stride3q, [strideq*3]
mov cntq, 7
.loop:
movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
pshufb m0, m3, m7
pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*1+ 0], m1
mova [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
pshufb m2, m3, m5
pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
mova [dstq+strideq*2+ 0], m2
mova [dstq+strideq*2+16], m2
mova [dstq+stride3q + 0], m3
@@ -500,8 +635,12 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
RET
%endmacro
-H_XMM_FUNCS ssse3
-H_XMM_FUNCS avx
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8
+INIT_XMM avx
+H_XMM_FUNCS 4, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
@@ -531,83 +670,124 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
; tm
-INIT_MMX ssse3
+%macro TM_MMX_FUNCS 0
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
pxor m1, m1
- pinsrw m2, [aq-1], 0
movd m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
mova m3, [pw_m256]
- mova m4, [pw_m255]
+ mova m1, [pw_m255]
pshufb m2, m3
- punpcklbw m0, m1
+%else
+ punpcklbw m2, m1
+ pshufw m2, m2, q0000
+%endif
psubw m0, m2
mov cntq, 1
.loop:
pinsrw m2, [lq+cntq*2], 0
- pshufb m1, m2, m4
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
pshufb m2, m3
- paddw m1, m0
+%else
+ punpcklbw m2, m1
+ pshufw m4, m2, q1111
+ pshufw m2, m2, q0000
+%endif
+ paddw m4, m0
paddw m2, m0
- packuswb m1, m1
+ packuswb m4, m4
packuswb m2, m2
- movd [dstq+strideq*0], m1
+ movd [dstq+strideq*0], m4
movd [dstq+strideq*1], m2
lea dstq, [dstq+strideq*2]
dec cntq
jge .loop
RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS
+INIT_MMX ssse3
+TM_MMX_FUNCS
-%macro TM_XMM_FUNCS 1
-INIT_XMM %1
+%macro TM_XMM_FUNCS 0
cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
pxor m1, m1
- pinsrw m2, [aq-1], 0
movh m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
mova m3, [pw_m256]
- mova m4, [pw_m255]
+ mova m1, [pw_m255]
pshufb m2, m3
- punpcklbw m0, m1
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
psubw m0, m2
mov cntq, 3
.loop:
pinsrw m2, [lq+cntq*2], 0
- pshufb m1, m2, m4
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
pshufb m2, m3
- paddw m1, m0
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m4, m2, q1111
+ pshufd m2, m2, q0000
+%endif
+ paddw m4, m0
paddw m2, m0
- packuswb m1, m2
- movh [dstq+strideq*0], m1
- movhps [dstq+strideq*1], m1
+ packuswb m4, m2
+ movh [dstq+strideq*0], m4
+ movhps [dstq+strideq*1], m4
lea dstq, [dstq+strideq*2]
dec cntq
jge .loop
RET
-INIT_XMM %1
cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
pxor m3, m3
- pinsrw m2, [aq-1], 0
mova m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
mova m4, [pw_m256]
- mova m5, [pw_m255]
+ mova m3, [pw_m255]
pshufb m2, m4
- punpckhbw m1, m0, m3
- punpcklbw m0, m3
+%else
+ punpcklbw m2, m3
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
psubw m1, m2
psubw m0, m2
mov cntq, 7
.loop:
pinsrw m7, [lq+cntq*2], 0
- pshufb m3, m7, m5
+%if cpuflag(ssse3)
+ pshufb m5, m7, m3
pshufb m7, m4
- paddw m2, m3, m0
- paddw m3, m1
+%else
+ punpcklbw m7, m3
+ punpcklwd m7, m7
+ pshufd m5, m7, q1111
+ pshufd m7, m7, q0000
+%endif
+ paddw m2, m5, m0
+ paddw m5, m1
paddw m6, m7, m0
paddw m7, m1
- packuswb m2, m3
+ packuswb m2, m5
packuswb m6, m7
mova [dstq+strideq*0], m2
mova [dstq+strideq*1], m6
@@ -617,16 +797,32 @@ cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
RET
%if ARCH_X86_64
-INIT_XMM %1
-cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
+%define mem 0
+%else
+%define mem 64
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
pxor m5, m5
pinsrw m4, [aq-1], 0
mova m0, [aq]
mova m2, [aq+16]
DEFINE_ARGS dst, stride, l, cnt
- mova m8, [pw_m256]
- mova m9, [pw_m255]
- pshufb m4, m8
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_m256]
+ mova m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+ pshufb m4, pw_m256_reg
+%else
+ punpcklbw m4, m5
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+%endif
punpckhbw m1, m0, m5
punpckhbw m3, m2, m5
punpcklbw m0, m5
@@ -635,36 +831,72 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
psubw m0, m4
psubw m3, m4
psubw m2, m4
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%else
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+ mova [rsp+3*16], m3
+%endif
mov cntq, 15
.loop:
- pinsrw m13, [lq+cntq*2], 0
- pshufb m7, m13, m9
- pshufb m13, m8
- paddw m4, m7, m0
- paddw m5, m7, m1
- paddw m6, m7, m2
- paddw m7, m3
- paddw m10, m13, m0
- paddw m11, m13, m1
- paddw m12, m13, m2
- paddw m13, m3
+ pinsrw m3, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m7, m3, pw_m255_reg
+ pshufb m3, pw_m256_reg
+%else
+ pxor m7, m7
+ punpcklbw m3, m7
+ punpcklwd m3, m3
+ pshufd m7, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+%if ARCH_X86_64
+ paddw m4, m7, m8
+ paddw m5, m7, m9
+ paddw m6, m7, m10
+ paddw m7, m11
+ paddw m0, m3, m8
+ paddw m1, m3, m9
+ paddw m2, m3, m10
+ paddw m3, m11
+%else
+ paddw m4, m7, [rsp+0*16]
+ paddw m5, m7, [rsp+1*16]
+ paddw m6, m7, [rsp+2*16]
+ paddw m7, [rsp+3*16]
+ paddw m0, m3, [rsp+0*16]
+ paddw m1, m3, [rsp+1*16]
+ paddw m2, m3, [rsp+2*16]
+ paddw m3, [rsp+3*16]
+%endif
packuswb m4, m5
packuswb m6, m7
- packuswb m10, m11
- packuswb m12, m13
+ packuswb m0, m1
+ packuswb m2, m3
mova [dstq+strideq*0+ 0], m4
mova [dstq+strideq*0+16], m6
- mova [dstq+strideq*1+ 0], m10
- mova [dstq+strideq*1+16], m12
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m2
lea dstq, [dstq+strideq*2]
dec cntq
jge .loop
RET
-%endif
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
%endmacro
-TM_XMM_FUNCS ssse3
-TM_XMM_FUNCS avx
+INIT_XMM sse2
+TM_XMM_FUNCS
+INIT_XMM ssse3
+TM_XMM_FUNCS
+INIT_XMM avx
+TM_XMM_FUNCS
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
@@ -711,11 +943,20 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
pavgb m%1, m%2
%endmacro
-INIT_MMX ssse3
+%macro DL_MMX_FUNCS 0
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
movq m1, [aq]
+%if cpuflag(ssse3)
pshufb m0, m1, [pb_0to5_2x7]
pshufb m2, m1, [pb_2to6_3x7]
+%else
+ punpckhbw m3, m1, m1 ; 44556677
+ pand m0, m1, [pb_6xm1_2x0] ; 012345__
+ pand m3, [pb_6x0_2xm1] ; ______77
+ psrlq m2, m1, 16 ; 234567__
+ por m0, m3 ; 01234577
+ por m2, m3 ; 23456777
+%endif
psrlq m1, 8
LOWPASS 0, 1, 2, 3
@@ -728,15 +969,29 @@ cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
movd [dstq+strideq*0], m0
movd [dstq+strideq*2], m1
RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS
+INIT_MMX ssse3
+DL_MMX_FUNCS
-%macro DL_XMM_FUNCS 1
-INIT_XMM %1
+%macro DL_XMM_FUNCS 0
cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
movq m0, [aq]
lea stride5q, [strideq*5]
+%if cpuflag(ssse3)
pshufb m1, m0, [pb_1to6_10x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7
+%endif
+ shufps m0, m1, q3310
+%if notcpuflag(ssse3)
+ psrldq m1, m0, 1
+ shufps m1, m0, q3210
+%endif
psrldq m2, m1, 1
- shufps m0, m1, q3210
LOWPASS 0, 1, 2, 3
pshufd m1, m0, q3321
@@ -757,46 +1012,72 @@ cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
movq [dstq+stride5q ], m1
RET
-INIT_XMM %1
cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
- mova m5, [pb_1toE_2xF]
mova m0, [aq]
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
pshufb m1, m0, m5
pshufb m2, m1, m5
pshufb m4, m0, [pb_15]
+%else
+ pand m5, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m5 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m5 ; 23456789ABCDEFFF
+ pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF
+%endif
LOWPASS 0, 1, 2, 3
DEFINE_ARGS dst, stride, cnt, stride9
- lea stride9q, [strideq*3]
+ lea stride9q, [strideq+strideq*8]
mov cntd, 4
- lea stride9q, [stride9q*3]
.loop:
movhlps m4, m0
mova [dstq+strideq*0], m0
+%if cpuflag(ssse3)
pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
mova [dstq+strideq*8], m4
movhlps m4, m0
mova [dstq+strideq*1], m0
+%if cpuflag(ssse3)
pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
mova [dstq+stride9q ], m4
lea dstq, [dstq+strideq*2]
dec cntd
jg .loop
RET
-INIT_XMM %1
cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
- mova m5, [pb_1toE_2xF]
mova m0, [aq]
mova m1, [aq+16]
- palignr m2, m1, m0, 1
- palignr m3, m1, m0, 2
+ PALIGNR m2, m1, m0, 1, m4
+ PALIGNR m3, m1, m0, 2, m4
LOWPASS 0, 2, 3, 4
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
pshufb m2, m1, m5
pshufb m3, m2, m5
pshufb m6, m1, [pb_15]
- LOWPASS 1, 2, 3, 4
mova m7, m6
+%else
+ pand m5, m1, [pb_15x0_1xm1] ; _______________F
+ psrldq m2, m1, 1 ; 123456789ABCDEF_
+ por m2, m5 ; 123456789ABCDEFF
+ psrldq m3, m2, 1 ; 23456789ABCDEFF_
+ por m3, m5 ; 23456789ABCDEFFF
+ pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF
+ pshufd m6, m7, q3333
+%endif
+ LOWPASS 1, 2, 3, 4
lea dst16q, [dstq +strideq*8]
mov cntd, 8
lea dst16q, [dst16q+strideq*8]
@@ -814,10 +1095,17 @@ cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
%if cpuflag(avx)
vpalignr m0, m1, m0, 1
pshufb m1, m5
-%else
+%elif cpuflag(ssse3)
palignr m2, m1, m0, 1
pshufb m1, m5
mova m0, m2
+%else
+ mova m4, m1
+ psrldq m0, 1
+ pslldq m4, 15
+ psrldq m1, 1
+ por m0, m4
+ por m1, m5
%endif
add dstq, strideq
add dst16q, strideq
@@ -826,19 +1114,23 @@ cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
RET
%endmacro
-DL_XMM_FUNCS ssse3
-DL_XMM_FUNCS avx
+INIT_XMM sse2
+DL_XMM_FUNCS
+INIT_XMM ssse3
+DL_XMM_FUNCS
+INIT_XMM avx
+DL_XMM_FUNCS
; dr
-INIT_MMX ssse3
+%macro DR_MMX_FUNCS 0
cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
movd m0, [lq]
punpckldq m0, [aq-1]
movd m1, [aq+3]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
- palignr m1, m0, 1
+ PALIGNR m1, m0, 1, m3
psrlq m2, m1, 8
LOWPASS 0, 1, 2, 3
@@ -850,9 +1142,14 @@ cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
psrlq m0, 8
movd [dstq+strideq*0], m0
RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS
-%macro DR_XMM_FUNCS 1
-INIT_XMM %1
+%macro DR_XMM_FUNCS 0
cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
movq m1, [lq]
movhps m1, [aq-1]
@@ -860,7 +1157,7 @@ cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pslldq m0, m1, 1
- palignr m2, m1, 1
+ PALIGNR m2, m1, 1, m3
LOWPASS 0, 1, 2, 3
movhps [dstq+strideq*0], m0
@@ -881,7 +1178,6 @@ cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
movhps [dstq+stride3q ], m0
RET
-INIT_XMM %1
cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
mova m1, [lq]
movu m2, [aq-1]
@@ -890,30 +1186,29 @@ cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
lea stride9q, [strideq *3]
mov cntd, 4
lea stride9q, [stride9q*3]
- palignr m4, m2, 1
- palignr m3, m2, m1, 15
+ PALIGNR m4, m2, 1, m5
+ PALIGNR m3, m2, m1, 15, m5
LOWPASS 3, 2, 4, 5
pslldq m0, m1, 1
- palignr m2, m1, 1
+ PALIGNR m2, m1, 1, m4
LOWPASS 0, 1, 2, 4
.loop:
mova [dstq+strideq*0 ], m3
movhps [dstq+strideq*8+0], m0
movq [dstq+strideq*8+8], m3
- palignr m3, m0, 15
+ PALIGNR m3, m0, 15, m1
pslldq m0, 1
mova [dstq+strideq*1 ], m3
movhps [dstq+stride9q +0], m0
movq [dstq+stride9q +8], m3
- palignr m3, m0, 15
+ PALIGNR m3, m0, 15, m1
pslldq m0, 1
lea dstq, [dstq+strideq*2]
dec cntd
jg .loop
RET
-INIT_XMM %1
cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
mova m1, [lq]
mova m2, [lq+16]
@@ -922,16 +1217,16 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
movd m5, [aq+31]
DEFINE_ARGS dst, stride, stride8, cnt
lea stride8q, [strideq*8]
- palignr m5, m4, 1
- palignr m6, m4, m3, 15
+ PALIGNR m5, m4, 1, m7
+ PALIGNR m6, m4, m3, 15, m7
LOWPASS 5, 4, 6, 7
- palignr m4, m3, 1
- palignr m6, m3, m2, 15
+ PALIGNR m4, m3, 1, m7
+ PALIGNR m6, m3, m2, 15, m7
LOWPASS 4, 3, 6, 7
- palignr m3, m2, 1
- palignr m6, m2, m1, 15
+ PALIGNR m3, m2, 1, m7
+ PALIGNR m6, m2, m1, 15, m7
LOWPASS 3, 2, 6, 7
- palignr m2, m1, 1
+ PALIGNR m2, m1, 1, m6
pslldq m0, m1, 1
LOWPASS 2, 1, 0, 6
mov cntd, 16
@@ -942,9 +1237,9 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
mova [dstq+stride8q*0+16], m5
mova [dstq+stride8q*2+ 0], m3
mova [dstq+stride8q*2+16], m4
- palignr m5, m4, 15
- palignr m4, m3, 15
- palignr m3, m2, 15
+ PALIGNR m5, m4, 15, m6
+ PALIGNR m4, m3, 15, m6
+ PALIGNR m3, m2, 15, m6
pslldq m2, 1
add dstq, strideq
dec cntd
@@ -952,12 +1247,16 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
RET
%endmacro
-DR_XMM_FUNCS ssse3
-DR_XMM_FUNCS avx
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS
; vl
-INIT_MMX ssse3
+INIT_MMX mmxext
cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
movq m0, [aq]
psrlq m1, m0, 8
@@ -973,11 +1272,16 @@ cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
movd [dstq+strideq*1], m2
RET
-%macro VL_XMM_FUNCS 1
-INIT_XMM %1
+%macro VL_XMM_FUNCS 0
cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
movq m0, [aq]
+%if cpuflag(ssse3)
pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0
+ punpckhwd m1, m1
+ shufps m0, m1, q3310
+%endif
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
psrldq m1, m0, 1
@@ -1002,48 +1306,82 @@ cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
movq [dstq+stride3q ], m2
RET
-INIT_XMM %1
cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
mova m0, [aq]
- mova m4, [pb_1toE_2xF]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
pshufb m1, m0, m4
pshufb m2, m1, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m4 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m4 ; 23456789ABCDEFFF
+%endif
LOWPASS 2, 1, 0, 3
pavgb m1, m0
mov cntd, 4
.loop:
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
+%if cpuflag(ssse3)
pshufb m1, m4
pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
mova [dstq+strideq*2], m1
mova [dstq+stride3q ], m2
+%if cpuflag(ssse3)
pshufb m1, m4
pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
-INIT_XMM %1
cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
mova m0, [aq]
mova m5, [aq+16]
- mova m4, [pb_1toE_2xF]
DEFINE_ARGS dst, stride, dst16, cnt
- palignr m2, m5, m0, 1
- palignr m3, m5, m0, 2
+ PALIGNR m2, m5, m0, 1, m4
+ PALIGNR m3, m5, m0, 2, m4
lea dst16q, [dstq +strideq*8]
LOWPASS 3, 2, 0, 6
pavgb m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
pshufb m0, m5, m4
pshufb m1, m0, m4
+%else
+ pand m4, m5, [pb_15x0_1xm1] ; _______________F
+ psrldq m0, m5, 1 ; 123456789ABCDEF_
+ por m0, m4 ; 123456789ABCDEFF
+ psrldq m1, m0, 1 ; 23456789ABCDEFF_
+ por m1, m4 ; 23456789ABCDEFFF
+%endif
lea dst16q, [dst16q+strideq*8]
LOWPASS 1, 0, 5, 6
pavgb m0, m5
+%if cpuflag(ssse3)
pshufb m5, [pb_15]
+%else
+ punpckhbw m5, m4, m4
+ pshufhw m5, m5, q3333
+ punpckhqdq m5, m5
+%endif
mov cntd, 8
.loop:
@@ -1056,10 +1394,16 @@ cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
%if cpuflag(avx)
palignr %2, %3, %2, 1
pshufb %3, m4
-%else
+%elif cpuflag(ssse3)
palignr m6, %3, %2, 1
pshufb %3, m4
mova %2, m6
+%else
+ pslldq m6, %3, 15
+ psrldq %3, 1
+ psrldq %2, 1
+ por %3, m4
+ por %2, m6
%endif
%endmacro
@@ -1072,12 +1416,16 @@ cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
RET
%endmacro
-VL_XMM_FUNCS ssse3
-VL_XMM_FUNCS avx
+INIT_XMM sse2
+VL_XMM_FUNCS
+INIT_XMM ssse3
+VL_XMM_FUNCS
+INIT_XMM avx
+VL_XMM_FUNCS
; vr
-INIT_MMX ssse3
+%macro VR_MMX_FUNCS 0
cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
movq m1, [aq-1]
punpckldq m2, [lq]
@@ -1085,7 +1433,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pavgb m0, m1
- palignr m1, m2, 5
+ PALIGNR m1, m2, 5, m3
psrlq m2, m1, 8
psllq m3, m1, 8
LOWPASS 2, 1, 3, 4
@@ -1095,6 +1443,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
; IABC | m0 contains ABCDxxxx
; JEFG | m2 contains xJIEFGHx
+%if cpuflag(ssse3)
punpckldq m0, m2
pshufb m2, [pb_13456_3xm1]
movd [dstq+strideq*0], m0
@@ -1103,10 +1452,26 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
psrlq m2, 8
movd [dstq+strideq*2], m0
movd [dstq+strideq*1], m2
+%else
+ psllq m1, m2, 40
+ psrlq m2, 24
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m2
+ PALIGNR m0, m1, 7, m3
+ psllq m1, 8
+ PALIGNR m2, m1, 7, m3
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m2
+%endif
RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS
+INIT_MMX ssse3
+VR_MMX_FUNCS
-%macro VR_XMM_FUNCS 1
-INIT_XMM %1
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
movu m1, [aq-1]
movhps m2, [lq]
@@ -1114,7 +1479,7 @@ cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pavgb m0, m1
- palignr m1, m2, 9
+ PALIGNR m1, m2, 9, m3
pslldq m2, m1, 1
pslldq m3, m1, 2
LOWPASS 1, 2, 3, 4
@@ -1128,83 +1493,118 @@ cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
; USQABCDE
; VTRIJKLM
+%if cpuflag(ssse3)
punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
movq [dstq+strideq*0], m0
- pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
movhps [dstq+strideq*1], m1
- pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+%if cpuflag(ssse3)
+ pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
+ pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+%else
+ psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx
+ pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx
+ packuswb m3, m2 ; xVTRxxxxxUSQxxxx
+ pslldq m3, 4 ; xxxxxVTRxxxxxUSQ
+ PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG
+ psrldq m1, 8
+ pslldq m3, 8
+ PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO
+%endif
movhps [dstq+strideq*2], m0
- pslldq m0, 1
movhps [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
+ pslldq m0, 1
pslldq m1, 1
movhps [dstq+strideq*0], m0
- pslldq m0, 1
movhps [dstq+strideq*1], m1
+ pslldq m0, 1
pslldq m1, 1
movhps [dstq+strideq*2], m0
movhps [dstq+stride3q ], m1
RET
-INIT_XMM %1
-cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
mova m0, [aq]
movu m1, [aq-1]
mova m2, [lq]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
- palignr m3, m1, m2, 15
+ PALIGNR m3, m1, m2, 15, m6
LOWPASS 3, 1, 0, 4
pavgb m0, m1
- palignr m1, m2, 1
+ PALIGNR m1, m2, 1, m6
pslldq m4, m2, 1
LOWPASS 1, 2, 4, 5
+%if cpuflag(ssse3)
pshufb m1, [pb_02468ACE_13579BDF]
+%else
+ psrlw m5, m1, 8
+ pand m1, [pw_255]
+ packuswb m1, m5
+%endif
mov cntd, 4
.loop:
movlhps m2, m1
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m3
- palignr m4, m0, m1, 15
- palignr m5, m3, m2, 15
+ PALIGNR m4, m0, m1, 15, m6
+ PALIGNR m5, m3, m2, 15, m6
mova [dstq+strideq*2], m4
mova [dstq+stride3q ], m5
lea dstq, [dstq+strideq*4]
- palignr m0, m1, 14
- palignr m3, m2, 14
+ PALIGNR m0, m1, 14, m6
+ PALIGNR m3, m2, 14, m6
pslldq m1, 2
dec cntd
jg .loop
RET
-%if ARCH_X86_64
-INIT_XMM %1
cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
mova m0, [aq]
mova m2, [aq+16]
movu m1, [aq-1]
- palignr m3, m2, m0, 15
- palignr m4, m2, m0, 14
+ PALIGNR m3, m2, m0, 15, m6
+ PALIGNR m4, m2, m0, 14, m6
LOWPASS 4, 3, 2, 5
pavgb m3, m2
mova m2, [lq+16]
- palignr m5, m1, m2, 15
+ PALIGNR m5, m1, m2, 15, m6
LOWPASS 5, 1, 0, 6
pavgb m0, m1
mova m6, [lq]
- palignr m1, m2, 1
- palignr m7, m2, m6, 15
- LOWPASS 1, 2, 7, 8
- palignr m2, m6, 1
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [dstq], m0
+%endif
+ PALIGNR m1, m2, 1, m0
+ PALIGNR m7, m2, m6, 15, m0
+ LOWPASS 1, 2, 7, 0
+ PALIGNR m2, m6, 1, m0
pslldq m7, m6, 1
- LOWPASS 2, 6, 7, 8
+ LOWPASS 2, 6, 7, 0
+%if cpuflag(ssse3)
pshufb m1, [pb_02468ACE_13579BDF]
pshufb m2, [pb_02468ACE_13579BDF]
+%else
+ psrlw m0, m1, 8
+ psrlw m6, m2, 8
+ pand m1, [pw_255]
+ pand m2, [pw_255]
+ packuswb m1, m0
+ packuswb m2, m6
+%endif
DEFINE_ARGS dst, stride, dst16, cnt
lea dst16q, [dstq +strideq*8]
lea dst16q, [dst16q+strideq*8]
SBUTTERFLY qdq, 2, 1, 6
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova m0, [dstq]
+%endif
mov cntd, 8
.loop:
@@ -1216,8 +1616,8 @@ cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
movhps [dst16q+stride%1 ], %2
movu [dst16q+stride%1+ 8], %3
movq [dst16q+stride%1+24], %4
- palignr %4, %3, 15
- palignr %3, %2, 15
+ PALIGNR %4, %3, 15, m6
+ PALIGNR %3, %2, 15, m6
pslldq %2, 1
%endmacro
@@ -1228,15 +1628,18 @@ cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
dec cntd
jg .loop
RET
-%endif
%endmacro
-VR_XMM_FUNCS ssse3
-VR_XMM_FUNCS avx
+INIT_XMM sse2
+VR_XMM_FUNCS 7
+INIT_XMM ssse3
+VR_XMM_FUNCS 6
+INIT_XMM avx
+VR_XMM_FUNCS 6
; hd
-INIT_MMX ssse3
+INIT_MMX mmxext
cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
movd m0, [lq]
punpckldq m0, [aq-1]
@@ -1266,9 +1669,8 @@ cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
movd [dstq+strideq*0], m0
RET
-%macro HD_XMM_FUNCS 1
-INIT_XMM %1
-cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
+%macro HD_XMM_FUNCS 0
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
movq m0, [lq]
movhps m0, [aq-1]
DEFINE_ARGS dst, stride, stride3, dst4
@@ -1296,18 +1698,17 @@ cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
movhps [dstq +stride3q ], m1
movq [dst4q+stride3q ], m1
- palignr m3, m2, m1, 2
+ PALIGNR m3, m2, m1, 2, m4
movhps [dstq +strideq*2], m3
movq [dst4q+strideq*2], m3
- palignr m3, m2, m1, 4
+ PALIGNR m3, m2, m1, 4, m4
movhps [dstq +strideq*1], m3
movq [dst4q+strideq*1], m3
- palignr m2, m1, 6
+ PALIGNR m2, m1, 6, m4
movhps [dstq +strideq*0], m2
movq [dst4q+strideq*0], m2
RET
-INIT_XMM %1
cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
mova m0, [lq]
movu m3, [aq-1]
@@ -1319,8 +1720,8 @@ cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
psrldq m4, m3, 1
psrldq m5, m3, 2
LOWPASS 5, 4, 3, 6
- palignr m1, m3, m0, 1
- palignr m2, m3, m0, 2
+ PALIGNR m1, m3, m0, 1, m6
+ PALIGNR m2, m3, m0, 2, m6
LOWPASS 2, 1, 0, 6
pavgb m1, m0
SBUTTERFLY bw, 1, 2, 6
@@ -1338,17 +1739,26 @@ cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
%if cpuflag(avx)
palignr m1, m2, m1, 2
palignr m2, m5, m2, 2
-%else
+%elif cpuflag(ssse3)
palignr m3, m2, m1, 2
palignr m0, m5, m2, 2
mova m1, m3
mova m2, m0
+%else
+ ; slightly modified version of PALIGNR
+ mova m6, m2
+ mova m4, m5
+ pslldq m6, 14
+ pslldq m4, 14
+ psrldq m1, 2
+ psrldq m2, 2
+ por m1, m6
+ por m2, m4
%endif
psrldq m5, 2
jg .loop
RET
-INIT_XMM %1
cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
mova m0, [lq]
mova m1, [lq+16]
@@ -1362,15 +1772,15 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
psrldq m4, m3, 1
psrldq m5, m3, 2
LOWPASS 5, 4, 3, 6
- palignr m4, m3, m2, 2
- palignr m3, m2, 1
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m3, m2, 1, m6
LOWPASS 4, 3, 2, 6
- palignr m3, m2, m1, 2
- palignr m2, m1, 1
+ PALIGNR m3, m2, m1, 2, m6
+ PALIGNR m2, m1, 1, m6
LOWPASS 3, 2, 1, 6
pavgb m2, m1
- palignr m6, m1, m0, 1
- palignr m1, m0, 2
+ PALIGNR m6, m1, m0, 1, m7
+ PALIGNR m1, m0, 2, m7
LOWPASS 1, 6, 0, 7
pavgb m0, m6
SBUTTERFLY bw, 2, 3, 6
@@ -1394,7 +1804,7 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
palignr m3, m4, m3, 2
palignr m4, m5, m4, 2
psrldq m5, 2
-%else
+%elif cpuflag(ssse3)
psrldq m6, m5, 2
palignr m5, m4, 2
palignr m4, m3, 2
@@ -1407,18 +1817,46 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
mova m3, m4
mova m4, m5
mova m5, m6
+%else
+ ; sort of a half-integrated version of PALIGNR
+ pslldq m7, m4, 14
+ pslldq m6, m5, 14
+ psrldq m4, 2
+ psrldq m5, 2
+ por m4, m6
+ pslldq m6, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ pslldq m7, m2, 14
+ psrldq m2, 2
+ por m2, m6
+ pslldq m6, m1, 14
+ psrldq m1, 2
+ por m1, m7
+ psrldq m0, 2
+ por m0, m6
%endif
jg .loop
RET
%endmacro
-HD_XMM_FUNCS ssse3
-HD_XMM_FUNCS avx
+INIT_XMM sse2
+HD_XMM_FUNCS
+INIT_XMM ssse3
+HD_XMM_FUNCS
+INIT_XMM avx
+HD_XMM_FUNCS
-INIT_MMX ssse3
+%macro HU_MMX_FUNCS 0
cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
movd m0, [lq]
+%if cpuflag(ssse3)
pshufb m0, [pb_0to2_5x3]
+%else
+ punpcklbw m1, m0, m0 ; 00112233
+ pshufw m1, m1, q3333 ; 33333333
+ punpckldq m0, m1 ; 01233333
+%endif
psrlq m1, m0, 8
psrlq m2, m1, 8
LOWPASS 2, 1, 0, 3
@@ -1426,7 +1864,7 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
SBUTTERFLY bw, 1, 2, 0
- palignr m2, m1, 2
+ PALIGNR m2, m1, 2, m0
movd [dstq+strideq*0], m1
movd [dstq+strideq*1], m2
punpckhdq m1, m1
@@ -1434,12 +1872,23 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
movd [dstq+strideq*2], m1
movd [dstq+stride3q ], m2
RET
+%endmacro
-%macro HU_XMM_FUNCS 1
-INIT_XMM %1
+INIT_MMX mmxext
+HU_MMX_FUNCS
+INIT_MMX ssse3
+HU_MMX_FUNCS
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
movq m0, [lq]
+%if cpuflag(ssse3)
pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4444555566667777
+ shufps m0, m1, q3310 ; 0123456777777777
+%endif
psrldq m1, m0, 1
psrldq m2, m1, 1
LOWPASS 2, 1, 0, 3
@@ -1450,56 +1899,81 @@ cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
SBUTTERFLY bw, 1, 2, 0
movq [dstq +strideq*0], m1
movhps [dst4q+strideq*0], m1
- palignr m0, m2, m1, 2
+ PALIGNR m0, m2, m1, 2, m3
movq [dstq +strideq*1], m0
movhps [dst4q+strideq*1], m0
- palignr m0, m2, m1, 4
+ PALIGNR m0, m2, m1, 4, m3
movq [dstq +strideq*2], m0
movhps [dst4q+strideq*2], m0
- palignr m2, m1, 6
+ PALIGNR m2, m1, 6, m3
movq [dstq +stride3q ], m2
movhps [dst4q+stride3q ], m2
RET
-INIT_XMM %1
cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
mova m0, [lq]
+%if cpuflag(ssse3)
mova m3, [pb_2toE_3xF]
pshufb m1, m0, [pb_1toE_2xF]
pshufb m2, m0, m3
+%else
+ pand m3, m0, [pb_15x0_1xm1]
+ psrldq m1, m0, 1
+ por m1, m3
+ punpckhbw m3, m3
+ psrldq m2, m0, 2
+ por m2, m3
+%endif
LOWPASS 2, 1, 0, 4
pavgb m1, m0
DEFINE_ARGS dst, stride, stride9, cnt
- lea stride9q, [strideq *3]
+ lea stride9q, [strideq*8+strideq]
mov cntd, 4
- lea stride9q, [stride9q*3]
SBUTTERFLY bw, 1, 2, 0
.loop:
mova [dstq+strideq*0], m1
mova [dstq+strideq*8], m2
- palignr m0, m2, m1, 2
+ PALIGNR m0, m2, m1, 2, m4
+%if cpuflag(ssse3)
pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
mova [dstq+strideq*1], m0
mova [dstq+stride9q ], m2
- palignr m1, m2, m0, 2
+ PALIGNR m1, m2, m0, 2, m4
+%if cpuflag(ssse3)
pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
lea dstq, [dstq+strideq*2]
dec cntd
jg .loop
RET
-INIT_XMM %1
-cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
mova m1, [lq]
mova m0, [lq+16]
- mova m4, [pb_2toE_3xF]
- palignr m2, m0, m1, 1
- palignr m3, m0, m1, 2
+ PALIGNR m2, m0, m1, 1, m5
+ PALIGNR m3, m0, m1, 2, m5
LOWPASS 3, 2, 1, 5
pavgb m2, m1
- pshufb m1, m0, m4
+%if cpuflag(ssse3)
+ mova m4, [pb_2toE_3xF]
pshufb m5, m0, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1]
+ psrldq m5, m0, 1
+ por m5, m4
+ punpckhbw m4, m4
+ psrldq m1, m0, 2
+ por m1, m4
+%endif
LOWPASS 1, 5, 0, 6
pavgb m0, m5
DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
@@ -1510,7 +1984,12 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
lea dst24q, [dst16q+strideq*8]
SBUTTERFLY bw, 0, 1, 5
SBUTTERFLY bw, 2, 3, 5
+%if cpuflag(ssse3)
pshufb m6, m1, [pb_15]
+%else
+ pshufhw m6, m4, q3333
+ punpckhqdq m6, m6
+%endif
.loop:
mova [dstq +stride0q+ 0], m2
@@ -1526,7 +2005,7 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
palignr m3, m0, m3, 2
palignr m0, m1, m0, 2
pshufb m1, m4
-%else
+%elif cpuflag(ssse3)
pshufb m5, m1, m4
palignr m1, m0, 2
palignr m0, m3, 2
@@ -1535,6 +2014,19 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
mova m3, m0
mova m0, m1
mova m1, m5
+%else
+ ; half-integrated version of PALIGNR
+ pslldq m5, m1, 14
+ pslldq m7, m0, 14
+ psrldq m1, 2
+ psrldq m0, 2
+ por m1, m4
+ por m0, m5
+ pslldq m5, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ psrldq m2, 2
+ por m2, m5
%endif
add stride0q, strideq
dec cntd
@@ -1542,7 +2034,11 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
RET
%endmacro
-HU_XMM_FUNCS ssse3
-HU_XMM_FUNCS avx
+INIT_XMM sse2
+HU_XMM_FUNCS 8
+INIT_XMM ssse3
+HU_XMM_FUNCS 7
+INIT_XMM avx
+HU_XMM_FUNCS 7
; FIXME 127, 128, 129 ?
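The repeated pslldq/psrldq/por triples in the non-SSSE3 branches above emulate palignr byte alignment. A standalone intrinsics sketch of that fallback (assumed equivalence for illustration, not taken from FFmpeg's x86util macro):

    /* Assumed-equivalent sketch of palignr on SSE2-only hardware:
     * concatenate hi:lo and shift right by n bytes (0 < n < 16). */
    #include <assert.h>
    #include <emmintrin.h>
    #include <string.h>

    #define ALIGNR_SSE2(hi, lo, n) \
        _mm_or_si128(_mm_srli_si128(lo, n), _mm_slli_si128(hi, 16 - (n)))

    int main(void)
    {
        unsigned char buf[32], out[16];
        for (int i = 0; i < 32; i++)
            buf[i] = (unsigned char)i;
        __m128i lo = _mm_loadu_si128((const __m128i *)buf);
        __m128i hi = _mm_loadu_si128((const __m128i *)(buf + 16));
        _mm_storeu_si128((__m128i *)out, ALIGNR_SSE2(hi, lo, 1));
        assert(memcmp(out, buf + 1, 16) == 0);   /* bytes 1..16 of hi:lo */
        return 0;
    }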