[FFmpeg-cvslog] x86/vp9lpf: add ff_vp9_loop_filter_[vh]_88_16_sse2()

James Almer git at videolan.org
Tue Jan 28 10:03:34 CET 2014


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Jan 28 04:59:45 2014 -0300| [644c32ea4b8092e2bb19083df1f3d7ea9f277b78] | committer: Clément Bœsch

x86/vp9lpf: add ff_vp9_loop_filter_[vh]_88_16_sse2()

Similar gains as the ssse3 version once again

Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=644c32ea4b8092e2bb19083df1f3d7ea9f277b78
---

 libavcodec/x86/vp9dsp_init.c |    3 +++
 libavcodec/x86/vp9lpf.asm    |   20 +++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index a6ea075..ced23ce 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -187,6 +187,7 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
 lpf_funcs(16, 16, sse2);
 lpf_funcs(16, 16, ssse3);
 lpf_funcs(16, 16, avx);
+lpf_funcs(88, 16, sse2);
 lpf_funcs(88, 16, ssse3);
 lpf_funcs(88, 16, avx);
 
@@ -248,6 +249,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_fpel(1, 1, 32, avg, sse2);
         init_fpel(0, 1, 64, avg, sse2);
         if (ARCH_X86_64) {
+            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
+            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
             dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
             dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
         }
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index b374884..d187b28 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -304,6 +304,17 @@ SECTION .text
 %define Q7 dst2q +  strideq
 %endmacro
 
+%macro SPLATB_MASK 2 ; %1=reg, splatted in place  %2=pshufb mask (only read on ssse3 and up)
+%if cpuflag(ssse3)
+    pshufb     %1, %2                                  ; single byte shuffle driven by the mask
+%else ; no pshufb: hardcoded shuffle, mask operand ignored (NOTE(review): presumably equal to the mask_mix layout, which is not visible here - confirm)
+    punpcklbw  %1, %1                                  ; double each low byte: words b0b0 b1b1 b2b2 ...
+    punpcklqdq %1, %1                                  ; mirror the low qword into the high qword
+    pshuflw    %1, %1, 0                               ; low four words <- word 0, so low 8 bytes = b0
+    pshufhw    %1, %1, 0x55                            ; high four words <- word 1 of high half, so high 8 bytes = b1
+%endif
+%endmacro
+
 %macro LOOPFILTER 2 ; %1=v/h %2=size1
     lea mstrideq, [strideq]
     neg mstrideq
@@ -394,11 +405,13 @@ SECTION .text
     SPLATB_REG          m2, I, m0                       ; I I I I ...
     SPLATB_REG          m3, E, m0                       ; E E E E ...
 %elif %2 == 88
+%if cpuflag(ssse3)
     mova                m0, [mask_mix]
+%endif
     movd                m2, Id
     movd                m3, Ed
-    pshufb              m2, m0
-    pshufb              m3, m0
+    SPLATB_MASK         m2, m0
+    SPLATB_MASK         m3, m0
 %endif
     mova                m0, [pb_80]
     pxor                m2, m0
@@ -456,7 +469,7 @@ SECTION .text
     SPLATB_REG          m7, H, m0                       ; H H H H ...
 %else
     movd                m7, Hd
-    pshufb              m7, [mask_mix]
+    SPLATB_MASK         m7, [mask_mix]
 %endif
     pxor                m7, m8
     pxor                m4, m8
@@ -760,6 +773,7 @@ LPF_16_16_VH sse2
 LPF_16_16_VH ssse3
 LPF_16_16_VH avx
 
+LPF_88_16_VH sse2
 LPF_88_16_VH ssse3
 LPF_88_16_VH avx
 



More information about the ffmpeg-cvslog mailing list