[FFmpeg-cvslog] x86/float_dsp: add ff_vector_fmul_reverse_avx2
James Almer
git at videolan.org
Wed Apr 12 03:36:21 EEST 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Apr 11 21:29:09 2017 -0300| [f1d80bc6305221506ad96f0ab82088f9229881ab] | committer: James Almer
x86/float_dsp: add ff_vector_fmul_reverse_avx2
~20% faster than AVX.
Signed-off-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f1d80bc6305221506ad96f0ab82088f9229881ab
---
libavutil/x86/float_dsp.asm | 15 ++++++++++++++-
libavutil/x86/float_dsp_init.c | 5 +++++
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 9affacb72b..edade0d55d 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -22,6 +22,9 @@
%include "x86util.asm"
+SECTION_RODATA 32
+pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -359,10 +362,16 @@ VECTOR_FMUL_ADD
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
+%if cpuflag(avx2)
+ mova m2, [pd_reverse]
+%endif
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
-%if cpuflag(avx)
+%if cpuflag(avx2)
+ vpermd m0, m2, [src1q]
+ vpermd m1, m2, [src1q+mmsize]
+%elif cpuflag(avx)
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123
@@ -391,6 +400,10 @@ VECTOR_FMUL_REVERSE
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VECTOR_FMUL_REVERSE
+%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 09c7a4d3b2..122087a196 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -67,6 +67,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
const float *src1, int len);
+void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
+ const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
@@ -101,6 +103,9 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
+ }
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
More information about the ffmpeg-cvslog
mailing list