[FFmpeg-cvslog] x86: fmtconvert: add special asm for float_to_int16_interleave_misc_*
Ronald S. Bultje
git at videolan.org
Sat Jun 30 22:47:43 CEST 2012
ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Thu Jun 14 15:03:08 2012 +0100| [66a02159ea9a09965dfa3e06ea55f41e5f615f90] | committer: Martin Storsjö
x86: fmtconvert: add special asm for float_to_int16_interleave_misc_*
This gets rid of a variable-length array and a for loop in C code.
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=66a02159ea9a09965dfa3e06ea55f41e5f615f90
---
libavcodec/x86/fmtconvert.asm | 78 +++++++++++++++++++++++++++++++++++++++
libavcodec/x86/fmtconvert_mmx.c | 12 +++---
2 files changed, 85 insertions(+), 5 deletions(-)
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 63befc9..4916e7a 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -115,6 +115,84 @@ FLOAT_TO_INT16 sse, 0
FLOAT_TO_INT16 3dnow, 0
%undef cvtps2pi
+;------------------------------------------------------------------------------
+; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_STEP 2
+cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
+ add lenq, lenq
+ lea srcq, [srcq+2*lenq]
+ lea step3q, [stepq*3]
+ neg lenq
+.loop:
+%ifidn %1, sse2
+ cvtps2dq m0, [srcq+2*lenq ]
+ cvtps2dq m1, [srcq+2*lenq+16]
+ packssdw m0, m1
+ movd v1d, m0
+ psrldq m0, 4
+ movd v2d, m0
+ psrldq m0, 4
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+ movd v1d, m0
+ psrldq m0, 4
+ movd v2d, m0
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+%else
+ cvtps2pi m0, [srcq+2*lenq ]
+ cvtps2pi m1, [srcq+2*lenq+ 8]
+ cvtps2pi m2, [srcq+2*lenq+16]
+ cvtps2pi m3, [srcq+2*lenq+24]
+ packssdw m0, m1
+ packssdw m2, m3
+ movd v1d, m0
+ psrlq m0, 32
+ movd v2d, m0
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+ movd v1d, m2
+ psrlq m2, 32
+ movd v2d, m2
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+%endif
+ add lenq, 16
+ js .loop
+%ifnidn %1, sse2
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16_STEP sse2, 2
+INIT_MMX
+FLOAT_TO_INT16_STEP sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_STEP 3dnow, 0
+%undef cvtps2pi
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 42cb0bc..aaf634d 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -25,6 +25,7 @@
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"
+#include "libavcodec/dsputil.h"
#if HAVE_YASM
@@ -35,6 +36,10 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
+void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
+
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
@@ -48,12 +53,9 @@ void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- DECLARE_ALIGNED(16, int16_t, tmp)[len];\
- int i,j,c;\
+ int c;\
for(c=0; c<channels; c++){\
- ff_float_to_int16_##cpu(tmp, src[c], len);\
- for(i=0, j=c; i<len; i++, j+=channels)\
- dst[j] = tmp[i];\
+ ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
}\
}\
\
More information about the ffmpeg-cvslog
mailing list