[FFmpeg-devel] [PATCH] vp9/x86: add ff_vp9_loop_filter_[vh]_16_16_sse2().
James Almer
jamrial at gmail.com
Fri Jan 17 07:40:36 CET 2014
This yields performance gains similar to those of the SSSE3 version.
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/vp9dsp_init.c | 19 +++++++++++++++----
libavcodec/x86/vp9lpf.asm | 10 ++++------
2 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 900efb3..ab3396e 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -177,10 +177,17 @@ itxfm_func(idct, idct, 32, avx);
#undef itxfm_func
#undef itxfm_funcs
-void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
-void ff_vp9_loop_filter_v_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
-void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
-void ff_vp9_loop_filter_h_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H)
+
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+
+#undef lpf_funcs
#endif /* HAVE_YASM */
@@ -230,6 +237,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(2, 1, 16, avg, sse2);
init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2);
+ if (ARCH_X86_64) {
+ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
+ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
+ }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index c5e5df9..60caf73 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -285,10 +285,8 @@ SECTION .text
; calc fm mask
pxor m0, m0
- movd m2, Id
- movd m3, Ed
- pshufb m2, m0 ; I I I I ...
- pshufb m3, m0 ; E E E E ...
+ SPLATB_REG m2, I, m0 ; I I I I ...
+ SPLATB_REG m3, E, m0 ; E E E E ...
mova m0, [pb_80]
pxor m2, m0
pxor m3, m0
@@ -341,8 +339,7 @@ SECTION .text
pand m2, m1
ABSSUB m4, m10, m11, m5 ; abs(p1 - p0)
pxor m0, m0
- movd m7, Hd
- pshufb m7, m0 ; H H H H ...
+ SPLATB_REG m7, H, m0 ; H H H H ...
pxor m7, m8
pxor m4, m8
pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
@@ -665,6 +662,7 @@ cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst
RET
%endmacro
+LPF_16_16_VH sse2
LPF_16_16_VH ssse3
LPF_16_16_VH avx
--
1.8.3.2
More information about the ffmpeg-devel mailing list