[FFmpeg-devel] [PATCH 2/2] avfilter/x86/vf_ssim: add ff_ssim_4x4_line_xop
James Almer
jamrial at gmail.com
Mon Jul 20 05:53:37 CEST 2015
~20% faster than the SSSE3 version. Also enabled on x86_32, since the XOP version needs only 8 XMM registers.
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavfilter/x86/vf_ssim.asm | 62 ++++++++++++++++++++++++++++++++++++++++--
libavfilter/x86/vf_ssim_init.c | 5 ++++
2 files changed, 64 insertions(+), 3 deletions(-)
diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm
index 6661987..3293e66 100644
--- a/libavfilter/x86/vf_ssim.asm
+++ b/libavfilter/x86/vf_ssim.asm
@@ -30,16 +30,50 @@ ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5)
SECTION .text
+%macro SSIM_4X4_LINE 1
%if ARCH_X86_64
-
-INIT_XMM ssse3
-cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+%else
+cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3
+%define wd r5mp
+%endif
lea ref_stride3q, [ref_strideq*3]
lea buf_stride3q, [buf_strideq*3]
+%if notcpuflag(xop)
pxor m7, m7
mova m15, [pw_1]
+%endif
.loop:
+%if cpuflag(xop)
+ pmovzxbw m0, [bufq+buf_strideq*0]
+ pmovzxbw m1, [refq+ref_strideq*0]
+ pmaddwd m4, m0, m0
+ pmaddwd m6, m0, m1
+ pmovzxbw m2, [bufq+buf_strideq*1]
+ vpmadcswd m4, m1, m1, m4
+ pmovzxbw m3, [refq+ref_strideq*1]
+ paddw m0, m2
+ vpmadcswd m4, m2, m2, m4
+ vpmadcswd m6, m2, m3, m6
+ paddw m1, m3
+ vpmadcswd m4, m3, m3, m4
+
+ pmovzxbw m2, [bufq+buf_strideq*2]
+ pmovzxbw m3, [refq+ref_strideq*2]
+ vpmadcswd m4, m2, m2, m4
+ vpmadcswd m6, m2, m3, m6
+ pmovzxbw m5, [bufq+buf_stride3q]
+ pmovzxbw m7, [refq+ref_stride3q]
+ vpmadcswd m4, m3, m3, m4
+ vpmadcswd m6, m5, m7, m6
+ paddw m0, m2
+ paddw m1, m3
+ vpmadcswd m4, m5, m5, m4
+ paddw m0, m5
+ paddw m1, m7
+ vpmadcswd m4, m7, m7, m4
+%else
movh m0, [bufq+buf_strideq*0] ; a1
movh m1, [refq+ref_strideq*0] ; b1
movh m2, [bufq+buf_strideq*1] ; a2
@@ -85,12 +119,25 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
paddd m4, m9
paddd m6, m14
paddd m4, m12
+%endif
; m0 = [word] s1 a,a,a,a,b,b,b,b
; m1 = [word] s2 a,a,a,a,b,b,b,b
; m4 = [dword] ss a,a,b,b
; m6 = [dword] s12 a,a,b,b
+%if cpuflag(xop)
+ vphaddwq m0, m0 ; [dword] s1 a, 0, b, 0
+ vphaddwq m1, m1 ; [dword] s2 a, 0, b, 0
+ vphadddq m4, m4 ; [dword] ss a, 0, b, 0
+ vphadddq m6, m6 ; [dword] s12 a, 0, b, 0
+ punpckhdq m2, m0, m1 ; [dword] s1 b, s2 b, 0, 0
+ punpckldq m0, m1 ; [dword] s1 a, s2 a, 0, 0
+ punpckhdq m3, m4, m6 ; [dword] ss b, s12 b, 0, 0
+ punpckldq m4, m6 ; [dword] ss a, s12 a, 0, 0
+ punpcklqdq m1, m2, m3 ; [dword] b s1, s2, ss, s12
+ punpcklqdq m0, m4 ; [dword] a s1, s2, ss, s12
+%else
pmaddwd m0, m15 ; [dword] s1 a,a,b,b
pmaddwd m1, m15 ; [dword] s2 a,a,b,b
phaddd m0, m4 ; [dword] s1 a, b, ss a, b
@@ -99,6 +146,7 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
punpckldq m0, m1 ; [dword] s1 a, s2 a, s1 b, s2 b
punpckhqdq m1, m0, m2 ; [dword] b s1, s2, ss, s12
punpcklqdq m0, m2 ; [dword] a s1, s2, ss, s12
+%endif
mova [sumsq+ 0], m0
mova [sumsq+mmsize], m1
@@ -109,7 +157,15 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
sub wd, mmsize/8
jg .loop
RET
+%endmacro
+%if ARCH_X86_64
+INIT_XMM ssse3
+SSIM_4X4_LINE 16
+%endif
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+SSIM_4X4_LINE 8
%endif
INIT_XMM sse4
diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c
index 9514b25..599c928 100644
--- a/libavfilter/x86/vf_ssim_init.c
+++ b/libavfilter/x86/vf_ssim_init.c
@@ -25,6 +25,9 @@
void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *ref, ptrdiff_t ref_stride,
int (*sums)[4], int w);
+void ff_ssim_4x4_line_xop (const uint8_t *buf, ptrdiff_t buf_stride,
+ const uint8_t *ref, ptrdiff_t ref_stride,
+ int (*sums)[4], int w);
float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
void ff_ssim_init_x86(SSIMDSPContext *dsp)
@@ -35,4 +38,6 @@ void ff_ssim_init_x86(SSIMDSPContext *dsp)
dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
dsp->ssim_end_line = ff_ssim_end_line_sse4;
+ if (EXTERNAL_XOP(cpu_flags))
+ dsp->ssim_4x4_line = ff_ssim_4x4_line_xop;
}
--
2.4.5
More information about the ffmpeg-devel mailing list