[FFmpeg-cvslog] x86/aacpsdsp: optimize ff_ps_mul_pair_single_sse
James Almer
git at videolan.org
Mon Jun 5 05:34:52 EEST 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Sun Jun 4 23:29:56 2017 -0300| [933dd62288ba9e73145932f229f355c985862641] | committer: James Almer
x86/aacpsdsp: optimize ff_ps_mul_pair_single_sse
~2% faster.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=933dd62288ba9e73145932f229f355c985862641
---
libavcodec/x86/aacpsdsp.asm | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index 4548bb4257..22a03f4f76 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -62,24 +62,27 @@ PS_ADD_SQUARES 3
; float *src1, int n);
;*******************************************************************
INIT_XMM sse
-cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
- xor r4q, r4q
+cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
+ shl nd, 3
+ add src1q, nq
+ add dstq, nq
+ neg nq
+align 16
.loop:
- movu m0, [src1q+r4q]
- movu m1, [src1q+r4q+mmsize]
+ movu m0, [src1q+nq]
+ movu m1, [src1q+nq+mmsize]
mova m2, [src2q]
mova m3, m2
unpcklps m2, m2
unpckhps m3, m3
mulps m0, m2
mulps m1, m3
- mova [dstq+r4q], m0
- mova [dstq+r4q+mmsize], m1
+ mova [dstq+nq], m0
+ mova [dstq+nq+mmsize], m1
add src2q, mmsize
- add r4q, mmsize*2
- sub nd, mmsize/4
- jg .loop
+ add nq, mmsize*2
+ jl .loop
REP_RET
;***********************************************************************
More information about the ffmpeg-cvslog mailing list