[FFmpeg-devel] [PATCH] vp9/x86: 16px MC functions (64bit only).
Ronald S. Bultje
rsbultje at gmail.com
Fri Dec 27 03:05:37 CET 2013
Cycle counts for large motion-compensation (MC) block sizes, old -> new, measured on ped1080p.webm for the mx != 0 && my != 0 case:
16x8: 876 -> 870 (0.7%)
16x16: 1444 -> 1435 (0.7%)
16x32: 2784 -> 2748 (1.3%)
32x16: 2455 -> 2349 (4.5%)
32x32: 4641 -> 4084 (13.6%)
32x64: 9200 -> 7834 (17.4%)
64x32: 8980 -> 7197 (24.8%)
64x64: 17330 -> 13796 (25.6%)
Total decoding time goes from 9.326 sec to 9.182 sec (about 1.5% less).
---
libavcodec/x86/vp9dsp_init.c | 5 ++
libavcodec/x86/vp9mc.asm | 122 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 127 insertions(+)
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 5c31db6..62264bf 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -56,6 +56,9 @@ mc_func(avg, sz, v, ssse3)
mc_funcs(4);
mc_funcs(8);
+#if ARCH_X86_64
+mc_funcs(16);
+#endif
#undef mc_funcs
#undef mc_func
@@ -78,7 +81,9 @@ mc_rep_func(avg, sz, hsz, h, ssse3); \
mc_rep_func(put, sz, hsz, v, ssse3); \
mc_rep_func(avg, sz, hsz, v, ssse3)
+#if ARCH_X86_32
mc_rep_funcs(16, 8);
+#endif
mc_rep_funcs(32, 16);
mc_rep_funcs(64, 32);
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 21c38b4..a7568f3 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -145,6 +145,62 @@ INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg
+%if ARCH_X86_64 ; 64-bit only: the macro below uses m8-m13, registers that exist only in 64-bit mode
+%macro filter_hx2_fn 1 ; horizontal 8-tap filter producing one full vector of output bytes per row; argument 1 is put or avg
+%assign %%px mmsize ; width suffix of the function name = bytes per vector register
+cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+ mova m13, [pw_256] ; multiplier for the pmulhrsw scaling step below (pw_256 is defined outside this view)
+ mova m8, [filteryq+ 0] ; coefficients multiplied with the m0/m1 data (loads at src-3 and src-2)
+ mova m9, [filteryq+16] ; coefficients multiplied with the m2/m3 data (loads at src-1 and src+0)
+ mova m10, [filteryq+32] ; coefficients multiplied with the m4/m5 data (loads at src+1 and src+2)
+ mova m11, [filteryq+48] ; coefficients multiplied with the m6/m7 data (loads at src+3 and src+4)
+.loop:
+ movu m0, [srcq-3] ; eight unaligned loads, one per tap, at byte offsets -3..+4
+ movu m1, [srcq-2]
+ movu m2, [srcq-1]
+ movu m3, [srcq+0]
+ movu m4, [srcq+1]
+ movu m5, [srcq+2]
+ movu m6, [srcq+3]
+ movu m7, [srcq+4]
+ add srcq, sstrideq ; advance to the next source row
+ SBUTTERFLY bw, 0, 1, 12 ; macro defined outside this view; presumably byte-interleaves m0/m1 (low half -> m0, high half -> m1) with m12 as scratch - confirm
+ SBUTTERFLY bw, 2, 3, 12
+ SBUTTERFLY bw, 4, 5, 12
+ SBUTTERFLY bw, 6, 7, 12
+ pmaddubsw m0, m8 ; unsigned source bytes * signed coefficient bytes, adjacent products summed into words
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m0, m2 ; partial sums combined with wrapping word adds
+ paddw m1, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddsw m0, m4 ; final add of the two partial sums uses signed saturation
+ paddsw m1, m5
+ pmulhrsw m0, m13 ; rounded high-half multiply by the pw_256 constant scales the sums down
+ pmulhrsw m1, m13
+ packuswb m0, m1 ; pack both word vectors into one vector of bytes with unsigned saturation
+%ifidn %1, avg
+ pavgb m0, [dstq] ; avg variant: rounded byte average with what is already in dst
+%endif
+ mova [dstq], m0 ; aligned store of one output row
+ add dstq, dstrideq ; advance to the next destination row
+ dec hd ; one row done
+ jg .loop ; repeat while the remaining row count is still positive
+ RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put ; instantiate the put variant
+filter_hx2_fn avg ; instantiate the avg variant
+
+%endif ; ARCH_X86_64
+
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
@@ -220,6 +276,72 @@ INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg
+%if ARCH_X86_64 ; 64-bit only: the macro below uses m8-m13 and eight general-purpose registers
+; vertical 8-tap filter producing one full vector of output bytes per row; the macro argument is put or avg
+%macro filter_vx2_fn 1
+%assign %%px mmsize ; width suffix of the function name = bytes per vector register
+cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+ sub srcq, sstrideq ; the three subtractions together move src up by three rows
+ lea sstride3q, [sstrideq*3] ; sstride3 = 3 * sstride, used to address the fourth row of each group
+ sub srcq, sstrideq
+ mova m13, [pw_256] ; multiplier for the pmulhrsw scaling step below (pw_256 is defined outside this view)
+ sub srcq, sstrideq
+ mova m8, [filteryq+ 0] ; coefficients multiplied with the m0/m1 data (rows src+0 and src+1 strides)
+ lea src4q, [srcq+sstrideq*4] ; src4 = adjusted src plus four rows
+ mova m9, [filteryq+16] ; coefficients multiplied with the m2/m3 data
+ mova m10, [filteryq+32] ; coefficients multiplied with the m4/m5 data
+ mova m11, [filteryq+48] ; coefficients multiplied with the m6/m7 data
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movu m0, [srcq] ; eight unaligned loads, one per tap: four rows from src, four rows from src4
+ movu m1, [srcq+sstrideq]
+ movu m2, [srcq+sstrideq*2]
+ movu m3, [srcq+sstride3q]
+ movu m4, [src4q]
+ movu m5, [src4q+sstrideq]
+ movu m6, [src4q+sstrideq*2]
+ movu m7, [src4q+sstride3q]
+ add srcq, sstrideq ; slide both row pointers down by one row for the next output row
+ add src4q, sstrideq
+ SBUTTERFLY bw, 0, 1, 12 ; macro defined outside this view; presumably byte-interleaves m0/m1 (low half -> m0, high half -> m1) with m12 as scratch - confirm
+ SBUTTERFLY bw, 2, 3, 12
+ SBUTTERFLY bw, 4, 5, 12
+ SBUTTERFLY bw, 6, 7, 12
+ pmaddubsw m0, m8 ; unsigned source bytes * signed coefficient bytes, adjacent products summed into words
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m0, m2 ; partial sums combined with wrapping word adds
+ paddw m1, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddsw m0, m4 ; final add of the two partial sums uses signed saturation
+ paddsw m1, m5
+ pmulhrsw m0, m13 ; rounded high-half multiply by the pw_256 constant scales the sums down
+ pmulhrsw m1, m13
+ packuswb m0, m1 ; pack both word vectors into one vector of bytes with unsigned saturation
+%ifidn %1, avg
+ pavgb m0, [dstq] ; avg variant: rounded byte average with what is already in dst
+%endif
+ mova [dstq], m0 ; aligned store of one output row
+ add dstq, dstrideq ; advance to the next destination row
+ dec hd ; one row done
+ jg .loop ; repeat while the remaining row count is still positive
+ RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put ; instantiate the put variant
+filter_vx2_fn avg ; instantiate the avg variant
+
+%endif ; ARCH_X86_64
+
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
--
1.8.4
More information about the ffmpeg-devel
mailing list