[FFmpeg-cvslog] avutil/pixelutils: faster pixelutils_sad_16x16

Clément Bœsch git at videolan.org
Sat Aug 23 20:14:17 CEST 2014


ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Sat Aug 23 20:03:10 2014 +0200| [554d8190624f25cefe079bd7b9ad61a2ade8541a] | committer: Clément Bœsch

avutil/pixelutils: faster pixelutils_sad_16x16

Runtime of pixelutils_sad_16x16 drops from 501 to 439 decicycles.

See commit 45c7f3997ea11c3d1007b2126b1c0049a8c27105.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=554d8190624f25cefe079bd7b9ad61a2ade8541a
---

 libavutil/x86/pixelutils.asm |   16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 15213d9..7522f24 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -109,18 +109,24 @@ cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
 ;-------------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
-    pxor        m4, m4
-%rep 8
-    movu        m0, [src1q]
+    movu        m4, [src1q]
+    movu        m2, [src2q]
     movu        m1, [src1q + stride1q]
+    movu        m3, [src2q + stride2q]
+    psadbw      m4, m2
+    psadbw      m1, m3
+    paddw       m4, m1
+%rep 7
+    lea         src1q, [src1q + 2*stride1q]
+    lea         src2q, [src2q + 2*stride2q]
+    movu        m0, [src1q]
     movu        m2, [src2q]
+    movu        m1, [src1q + stride1q]
     movu        m3, [src2q + stride2q]
     psadbw      m0, m2
     psadbw      m1, m3
     paddw       m4, m0
     paddw       m4, m1
-    lea         src1q, [src1q + 2*stride1q]
-    lea         src2q, [src2q + 2*stride2q]
 %endrep
     movhlps     m0, m4
     paddw       m4, m0



More information about the ffmpeg-cvslog mailing list