[FFmpeg-devel] [PATCH 1/2] x86/vf_stereo3d: optimize register usage
James Almer
jamrial at gmail.com
Sun Dec 27 23:00:36 CET 2015
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavfilter/x86/vf_stereo3d.asm | 164 +++++++++++++++++++++-------------------
1 file changed, 86 insertions(+), 78 deletions(-)
diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
index 94a0473..29a8c56 100644
--- a/libavfilter/x86/vf_stereo3d.asm
+++ b/libavfilter/x86/vf_stereo3d.asm
@@ -37,125 +37,133 @@ ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
SECTION .text
INIT_XMM sse4
-cglobal anaglyph, 11, 13, 16, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b
- movu m13, [ana_matrix_rq+ 0]
- movq m15, [ana_matrix_rq+16]
- pshufd m10, m13, q0000
- pshufd m11, m13, q1111
- pshufd m12, m13, q2222
- pshufd m13, m13, q3333
- pshufd m14, m15, q0000
- pshufd m15, m15, q1111
- mova [rsp+mmsize*0], m10
- mova [rsp+mmsize*1], m11
- mova [rsp+mmsize*2], m12
- mova [rsp+mmsize*3], m13
- mova [rsp+mmsize*4], m14
- mova [rsp+mmsize*5], m15
-
- movu m13, [ana_matrix_gq+ 0]
- movq m15, [ana_matrix_gq+16]
- pshufd m10, m13, q0000
- pshufd m11, m13, q1111
- pshufd m12, m13, q2222
- pshufd m13, m13, q3333
- pshufd m14, m15, q0000
- pshufd m15, m15, q1111
- mova [rsp+mmsize*6 ], m10
- mova [rsp+mmsize*7 ], m11
- mova [rsp+mmsize*8 ], m12
- mova [rsp+mmsize*9 ], m13
- mova [rsp+mmsize*10], m14
- mova [rsp+mmsize*11], m15
-
- movu m13, [ana_matrix_bq+ 0]
- movq m15, [ana_matrix_bq+16]
- pshufd m10, m13, q0000
- pshufd m11, m13, q1111
- pshufd m12, m13, q2222
- pshufd m13, m13, q3333
- pshufd m14, m15, q0000
- pshufd m15, m15, q1111
+cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt
+%define ana_matrix_rq r6q
+%define ana_matrix_gq r7q
+%define ana_matrix_bq r8q
+ mov ana_matrix_rq, r8m
+ mov ana_matrix_gq, r9m
+ mov ana_matrix_bq, r10m
+ movu m3, [ana_matrix_rq+ 0]
+ movq m5, [ana_matrix_rq+16]
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ mova [rsp+mmsize*0], m0
+ mova [rsp+mmsize*1], m1
+ mova [rsp+mmsize*2], m2
+ mova [rsp+mmsize*3], m3
+ mova [rsp+mmsize*4], m4
+ mova [rsp+mmsize*5], m5
+
+ movu m3, [ana_matrix_gq+ 0]
+ movq m5, [ana_matrix_gq+16]
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ mova [rsp+mmsize*6 ], m0
+ mova [rsp+mmsize*7 ], m1
+ mova [rsp+mmsize*8 ], m2
+ mova [rsp+mmsize*9 ], m3
+ mova [rsp+mmsize*10], m4
+ mova [rsp+mmsize*11], m5
+
+ movu m11, [ana_matrix_bq+ 0]
+ movq m13, [ana_matrix_bq+16]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pshufd m12, m13, q0000
+ pshufd m13, m13, q1111
+ mov widthd, dword widthm
+ mov heightd, dword heightm
+
.nextrow:
- mov r11q, widthq
- mov r12q, 0
- %define o r12q
+ mov od, widthd
+ xor cntd, cntd
.loop:
- movu m0, [lsrcq+o+0]
+ movu m0, [lsrcq+cntq]
pshufb m1, m0, [ex_r]
pshufb m2, m0, [ex_g]
pshufb m3, m0, [ex_b]
- movu m0, [rsrcq+o+0]
+ movu m0, [rsrcq+cntq]
pshufb m4, m0, [ex_r]
pshufb m5, m0, [ex_g]
- pshufb m6, m0, [ex_b]
+ pshufb m0, [ex_b]
pmulld m1, [rsp+mmsize*0]
pmulld m2, [rsp+mmsize*1]
pmulld m3, [rsp+mmsize*2]
pmulld m4, [rsp+mmsize*3]
pmulld m5, [rsp+mmsize*4]
- pmulld m6, [rsp+mmsize*5]
+ pmulld m0, [rsp+mmsize*5]
paddd m1, m2
paddd m3, m4
- paddd m5, m6
+ paddd m5, m0
paddd m1, m3
paddd m1, m5
- movu m0, [lsrcq+o+0]
+ movu m0, [lsrcq+cntq]
pshufb m7, m0, [ex_r]
pshufb m2, m0, [ex_g]
pshufb m3, m0, [ex_b]
- movu m0, [rsrcq+o+0]
+ movu m0, [rsrcq+cntq]
pshufb m4, m0, [ex_r]
pshufb m5, m0, [ex_g]
- pshufb m6, m0, [ex_b]
+ pshufb m0, [ex_b]
pmulld m7, [rsp+mmsize*6]
pmulld m2, [rsp+mmsize*7]
pmulld m3, [rsp+mmsize*8]
pmulld m4, [rsp+mmsize*9]
pmulld m5, [rsp+mmsize*10]
- pmulld m6, [rsp+mmsize*11]
+ pmulld m0, [rsp+mmsize*11]
paddd m7, m2
paddd m3, m4
- paddd m5, m6
+ paddd m5, m0
paddd m7, m3
paddd m7, m5
- movu m0, [lsrcq+o+0]
- pshufb m8, m0, [ex_r]
- pshufb m2, m0, [ex_g]
- pshufb m3, m0, [ex_b]
- movu m0, [rsrcq+o+0]
- pshufb m4, m0, [ex_r]
- pshufb m5, m0, [ex_g]
- pshufb m6, m0, [ex_b]
- pmulld m8, m10
- pmulld m2, m11
- pmulld m3, m12
- pmulld m4, m13
- pmulld m5, m14
- pmulld m6, m15
- paddd m8, m2
- paddd m3, m4
- paddd m5, m6
- paddd m8, m3
- paddd m8, m5
+ movu m0, [lsrcq+cntq]
+ pshufb m2, m0, [ex_r]
+ pshufb m3, m0, [ex_g]
+ pshufb m4, m0, [ex_b]
+ movu m0, [rsrcq+cntq]
+ pshufb m5, m0, [ex_r]
+ pshufb m6, m0, [ex_g]
+ pshufb m0, [ex_b]
+ pmulld m2, m8
+ pmulld m3, m9
+ pmulld m4, m10
+ pmulld m5, m11
+ pmulld m6, m12
+ pmulld m0, m13
+ paddd m2, m3
+ paddd m4, m5
+ paddd m6, m0
+ paddd m2, m4
+ paddd m2, m6
psrld m1, 16
psrld m7, 16
- psrld m8, 16
+ psrld m2, 16
packusdw m1, m7
- packusdw m8, m8
- packuswb m1, m8
+ packusdw m2, m2
+ packuswb m1, m2
pshufb m1, [shuf]
- movq [dstq+o+0], m1
+ movq [dstq+cntq+0], m1
psrldq m1, 8
- movd [dstq+o+8], m1
- add r12d, 12
- sub r11d, 4
+ movd [dstq+cntq+8], m1
+ add cntd, 12
+ sub od, 4
jg .loop
add dstq, dst_linesizeq
--
2.6.3
More information about the ffmpeg-devel mailing list