[FFmpeg-cvslog] dct32: Add SSE2 ASM optimizations
Vitor Sessak
git at videolan.org
Tue Aug 2 22:24:36 CEST 2011
ffmpeg | branch: master | Vitor Sessak <vitor1001 at gmail.com> | Sat Jul 30 18:39:25 2011 +0200| [18b131de0473a3110c63966cd7c6cd2ab118d401] | committer: Ronald S. Bultje
dct32: Add SSE2 ASM optimizations
Signed-off-by: Ronald S. Bultje <rsbultje at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=18b131de0473a3110c63966cd7c6cd2ab118d401
---
libavcodec/x86/dct32_sse.asm | 39 ++++++++++++++++++++++++++++++---------
libavcodec/x86/fft.c | 2 ++
libavcodec/x86/fft.h | 1 +
3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 46daa43..720a061 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -63,6 +63,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
mulps %1, %3
%endmacro
+%macro BUTTERFLY0_SSE2 5
+ pshufd %4, %1, %5
+ xorps %1, %2
+ addps %1, %4
+ mulps %1, %3
+%endmacro
+
%macro BUTTERFLY0_AVX 5
vshufps %4, %1, %1, %5
vxorps %1, %1, %2
@@ -405,18 +412,17 @@ INIT_XMM
INIT_XMM
+%macro DCT32_FUNC 1
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_sse, 2,3,16, out, in, tmp
+cglobal dct32_float_%1, 2,3,16, out, in, tmp
; pass 1
movaps m0, [inq+0]
- movaps m1, [inq+112]
- shufps m1, m1, 0x1b
+ LOAD_INV m1, [inq+112]
BUTTERFLY m0, m1, [ps_cos_vec], m3
movaps m7, [inq+64]
- movaps m4, [inq+48]
- shufps m4, m4, 0x1b
+ LOAD_INV m4, [inq+48]
BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; pass 2
@@ -427,13 +433,11 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
; pass 1
movaps m1, [inq+16]
- movaps m6, [inq+96]
- shufps m6, m6, 0x1b
+ LOAD_INV m6, [inq+96]
BUTTERFLY m1, m6, [ps_cos_vec+16], m3
movaps m4, [inq+80]
- movaps m5, [inq+32]
- shufps m5, m5, 0x1b
+ LOAD_INV m5, [inq+32]
BUTTERFLY m4, m5, [ps_cos_vec+48], m3
; pass 2
@@ -492,3 +496,20 @@ cglobal dct32_float_sse, 2,3,16, out, in, tmp
PASS5
PASS6
RET
+%endmacro
+
+%macro LOAD_INV_SSE 2
+ movaps %1, %2
+ shufps %1, %1, 0x1b
+%endmacro
+
+%define LOAD_INV LOAD_INV_SSE
+DCT32_FUNC sse
+
+%macro LOAD_INV_SSE2 2
+ pshufd %1, %2, 0x1b
+%endmacro
+
+%define LOAD_INV LOAD_INV_SSE2
+%define BUTTERFLY0 BUTTERFLY0_SSE2
+DCT32_FUNC sse2
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 899f0f7..f7308cc 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -60,6 +60,8 @@ av_cold void ff_dct_init_mmx(DCTContext *s)
int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
s->dct32 = ff_dct32_float_avx;
+ else if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
+ s->dct32 = ff_dct32_float_sse2;
else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
s->dct32 = ff_dct32_float_sse;
#endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 0ade2b2..9d68d5b 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -35,6 +35,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
#endif /* AVCODEC_X86_FFT_H */
More information about the ffmpeg-cvslog
mailing list