[FFmpeg-cvslog] vp9lpf/x86: add an SSE2 version of vp9_loop_filter_[vh]_88_16
James Almer
git at videolan.org
Thu Mar 23 12:44:15 EET 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Jan 28 04:59:45 2014 -0300| [92d47550ea099fde8c6f4443c94ec768e19ffd26] | committer: Anton Khirnov
vp9lpf/x86: add an SSE2 version of vp9_loop_filter_[vh]_88_16
Similar gains to the ssse3 version once again.
Additional improvements by Clément Bœsch <u at pkh.me>.
Signed-off-by: James Almer <jamrial at gmail.com>
Signed-off-by: Anton Khirnov <anton at khirnov.net>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=92d47550ea099fde8c6f4443c94ec768e19ffd26
---
libavcodec/x86/vp9dsp_init.c | 3 +++
libavcodec/x86/vp9lpf.asm | 20 +++++++++++++++++---
2 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 00a5798..37d53d2 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -226,6 +226,7 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
+lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx);
@@ -293,6 +294,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2);
if (ARCH_X86_64) {
+ dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
+ dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
}
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 183f3f6..bde3fcb 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -292,6 +292,17 @@ SECTION .text
%define Q7 dst2q + strideq
%endmacro
+; ..............AB -> AAAAAAAABBBBBBBB
+%macro SPLATB_MIX 1-2 [mask_mix]
+%if cpuflag(ssse3)
+ pshufb %1, %2
+%else
+ punpcklbw %1, %1
+ punpcklwd %1, %1
+ punpckldq %1, %1
+%endif
+%endmacro
+
%macro LOOPFILTER 2 ; %1=v/h %2=size1
lea mstrideq, [strideq]
neg mstrideq
@@ -382,11 +393,13 @@ SECTION .text
SPLATB_REG m2, I, m0 ; I I I I ...
SPLATB_REG m3, E, m0 ; E E E E ...
%elif %2 == 88
+%if cpuflag(ssse3)
mova m0, [mask_mix]
+%endif
movd m2, Id
movd m3, Ed
- pshufb m2, m0
- pshufb m3, m0
+ SPLATB_MIX m2, m0
+ SPLATB_MIX m3, m0
%endif
mova m0, [pb_80]
pxor m2, m0
@@ -446,7 +459,7 @@ SECTION .text
SPLATB_REG m7, H, m0 ; H H H H ...
%else
movd m7, Hd
- pshufb m7, [mask_mix]
+ SPLATB_MIX m7
%endif
pxor m7, m8
pxor m4, m8
@@ -727,6 +740,7 @@ LPF_16_16_VH sse2
LPF_16_16_VH ssse3
LPF_16_16_VH avx
+LPF_88_16_VH sse2
LPF_88_16_VH ssse3
LPF_88_16_VH avx
More information about the ffmpeg-cvslog mailing list