[FFmpeg-devel] [PATCH 1/2] x86/vf_stereo3d: optimize register usage

James Almer <jamrial@gmail.com>
Sun Dec 27 23:00:36 CET 2015


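Load the three matrix pointers from their stack argument slots instead of
dedicating registers to them, and reuse m0 as the temporary for the blue
channel. This brings the function down from 13 GPRs and 16 XMM registers
to 10 and 14, respectively.
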
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavfilter/x86/vf_stereo3d.asm | 164 +++++++++++++++++++++-------------------
 1 file changed, 86 insertions(+), 78 deletions(-)
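
For reference, the C prototype this function implements looks roughly like
the following (a sketch; the exact argument types declared in stereo3d.h
may differ):

    void ff_anaglyph_sse4(uint8_t *dst, uint8_t *lsrc, uint8_t *rsrc,
                          ptrdiff_t dst_linesize, ptrdiff_t l_linesize,
                          ptrdiff_t r_linesize, int width, int height,
                          const int *ana_matrix_r, const int *ana_matrix_g,
                          const int *ana_matrix_b);

With eleven arguments, the last three always live on the stack on x86_64,
so instead of having x86inc load all eleven into GPRs up front, the new
code declares only the first six as register arguments and fetches the
matrix pointers itself from their r8m/r9m/r10m argument slots. Once the
coefficients are unpacked to the stack and to m8-m13, those same three
GPRs are reused for width, height and the inner loop counter.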

diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
index 94a0473..29a8c56 100644
--- a/libavfilter/x86/vf_stereo3d.asm
+++ b/libavfilter/x86/vf_stereo3d.asm
@@ -37,125 +37,133 @@ ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
 SECTION .text
 
 INIT_XMM sse4
-cglobal anaglyph, 11, 13, 16, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b
-    movu                 m13, [ana_matrix_rq+ 0]
-    movq                 m15, [ana_matrix_rq+16]
-    pshufd               m10, m13, q0000
-    pshufd               m11, m13, q1111
-    pshufd               m12, m13, q2222
-    pshufd               m13, m13, q3333
-    pshufd               m14, m15, q0000
-    pshufd               m15, m15, q1111
-    mova      [rsp+mmsize*0], m10
-    mova      [rsp+mmsize*1], m11
-    mova      [rsp+mmsize*2], m12
-    mova      [rsp+mmsize*3], m13
-    mova      [rsp+mmsize*4], m14
-    mova      [rsp+mmsize*5], m15
-
-    movu                 m13, [ana_matrix_gq+ 0]
-    movq                 m15, [ana_matrix_gq+16]
-    pshufd               m10, m13, q0000
-    pshufd               m11, m13, q1111
-    pshufd               m12, m13, q2222
-    pshufd               m13, m13, q3333
-    pshufd               m14, m15, q0000
-    pshufd               m15, m15, q1111
-    mova     [rsp+mmsize*6 ], m10
-    mova     [rsp+mmsize*7 ], m11
-    mova     [rsp+mmsize*8 ], m12
-    mova     [rsp+mmsize*9 ], m13
-    mova     [rsp+mmsize*10], m14
-    mova     [rsp+mmsize*11], m15
-
-    movu                 m13, [ana_matrix_bq+ 0]
-    movq                 m15, [ana_matrix_bq+16]
-    pshufd               m10, m13, q0000
-    pshufd               m11, m13, q1111
-    pshufd               m12, m13, q2222
-    pshufd               m13, m13, q3333
-    pshufd               m14, m15, q0000
-    pshufd               m15, m15, q1111
+cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt
+%define ana_matrix_rq r6q
+%define ana_matrix_gq r7q
+%define ana_matrix_bq r8q
+    mov        ana_matrix_rq, r8m
+    mov        ana_matrix_gq, r9m
+    mov        ana_matrix_bq, r10m
+    movu                  m3, [ana_matrix_rq+ 0]
+    movq                  m5, [ana_matrix_rq+16]
+    pshufd                m0, m3, q0000
+    pshufd                m1, m3, q1111
+    pshufd                m2, m3, q2222
+    pshufd                m3, m3, q3333
+    pshufd                m4, m5, q0000
+    pshufd                m5, m5, q1111
+    mova      [rsp+mmsize*0], m0
+    mova      [rsp+mmsize*1], m1
+    mova      [rsp+mmsize*2], m2
+    mova      [rsp+mmsize*3], m3
+    mova      [rsp+mmsize*4], m4
+    mova      [rsp+mmsize*5], m5
+
+    movu                  m3, [ana_matrix_gq+ 0]
+    movq                  m5, [ana_matrix_gq+16]
+    pshufd                m0, m3, q0000
+    pshufd                m1, m3, q1111
+    pshufd                m2, m3, q2222
+    pshufd                m3, m3, q3333
+    pshufd                m4, m5, q0000
+    pshufd                m5, m5, q1111
+    mova     [rsp+mmsize*6 ], m0
+    mova     [rsp+mmsize*7 ], m1
+    mova     [rsp+mmsize*8 ], m2
+    mova     [rsp+mmsize*9 ], m3
+    mova     [rsp+mmsize*10], m4
+    mova     [rsp+mmsize*11], m5
+
+    movu                 m11, [ana_matrix_bq+ 0]
+    movq                 m13, [ana_matrix_bq+16]
+    pshufd                m8, m11, q0000
+    pshufd                m9, m11, q1111
+    pshufd               m10, m11, q2222
+    pshufd               m11, m11, q3333
+    pshufd               m12, m13, q0000
+    pshufd               m13, m13, q1111
+    mov               widthd, dword widthm
+    mov              heightd, dword heightm
+
 .nextrow:
-    mov       r11q, widthq
-    mov       r12q, 0
-    %define      o  r12q
+    mov                   od, widthd
+    xor                 cntd, cntd
 
     .loop:
-        movu                 m0, [lsrcq+o+0]
+        movu                 m0, [lsrcq+cntq]
         pshufb               m1, m0, [ex_r]
         pshufb               m2, m0, [ex_g]
         pshufb               m3, m0, [ex_b]
-        movu                 m0, [rsrcq+o+0]
+        movu                 m0, [rsrcq+cntq]
         pshufb               m4, m0, [ex_r]
         pshufb               m5, m0, [ex_g]
-        pshufb               m6, m0, [ex_b]
+        pshufb               m0, [ex_b]
         pmulld               m1, [rsp+mmsize*0]
         pmulld               m2, [rsp+mmsize*1]
         pmulld               m3, [rsp+mmsize*2]
         pmulld               m4, [rsp+mmsize*3]
         pmulld               m5, [rsp+mmsize*4]
-        pmulld               m6, [rsp+mmsize*5]
+        pmulld               m0, [rsp+mmsize*5]
         paddd                m1, m2
         paddd                m3, m4
-        paddd                m5, m6
+        paddd                m5, m0
         paddd                m1, m3
         paddd                m1, m5
 
-        movu                 m0, [lsrcq+o+0]
+        movu                 m0, [lsrcq+cntq]
         pshufb               m7, m0, [ex_r]
         pshufb               m2, m0, [ex_g]
         pshufb               m3, m0, [ex_b]
-        movu                 m0, [rsrcq+o+0]
+        movu                 m0, [rsrcq+cntq]
         pshufb               m4, m0, [ex_r]
         pshufb               m5, m0, [ex_g]
-        pshufb               m6, m0, [ex_b]
+        pshufb               m0, [ex_b]
         pmulld               m7, [rsp+mmsize*6]
         pmulld               m2, [rsp+mmsize*7]
         pmulld               m3, [rsp+mmsize*8]
         pmulld               m4, [rsp+mmsize*9]
         pmulld               m5, [rsp+mmsize*10]
-        pmulld               m6, [rsp+mmsize*11]
+        pmulld               m0, [rsp+mmsize*11]
         paddd                m7, m2
         paddd                m3, m4
-        paddd                m5, m6
+        paddd                m5, m0
         paddd                m7, m3
         paddd                m7, m5
 
-        movu                 m0, [lsrcq+o+0]
-        pshufb               m8, m0, [ex_r]
-        pshufb               m2, m0, [ex_g]
-        pshufb               m3, m0, [ex_b]
-        movu                 m0, [rsrcq+o+0]
-        pshufb               m4, m0, [ex_r]
-        pshufb               m5, m0, [ex_g]
-        pshufb               m6, m0, [ex_b]
-        pmulld               m8, m10
-        pmulld               m2, m11
-        pmulld               m3, m12
-        pmulld               m4, m13
-        pmulld               m5, m14
-        pmulld               m6, m15
-        paddd                m8, m2
-        paddd                m3, m4
-        paddd                m5, m6
-        paddd                m8, m3
-        paddd                m8, m5
+        movu                 m0, [lsrcq+cntq]
+        pshufb               m2, m0, [ex_r]
+        pshufb               m3, m0, [ex_g]
+        pshufb               m4, m0, [ex_b]
+        movu                 m0, [rsrcq+cntq]
+        pshufb               m5, m0, [ex_r]
+        pshufb               m6, m0, [ex_g]
+        pshufb               m0, [ex_b]
+        pmulld               m2, m8
+        pmulld               m3, m9
+        pmulld               m4, m10
+        pmulld               m5, m11
+        pmulld               m6, m12
+        pmulld               m0, m13
+        paddd                m2, m3
+        paddd                m4, m5
+        paddd                m6, m0
+        paddd                m2, m4
+        paddd                m2, m6
 
         psrld                m1, 16
         psrld                m7, 16
-        psrld                m8, 16
+        psrld                m2, 16
 
         packusdw             m1, m7
-        packusdw             m8, m8
-        packuswb             m1, m8
+        packusdw             m2, m2
+        packuswb             m1, m2
         pshufb               m1, [shuf]
 
-        movq         [dstq+o+0], m1
+        movq      [dstq+cntq+0], m1
         psrldq               m1, 8
-        movd         [dstq+o+8], m1
-        add                r12d, 12
-        sub                r11d, 4
+        movd      [dstq+cntq+8], m1
+        add                cntd, 12
+        sub                  od, 4
     jg .loop
 
     add          dstq, dst_linesizeq
-- 
2.6.3