[FFmpeg-devel] [PATCH 3/4] swr: add ff_resample_{common,linear}_float_fma
James Almer
jamrial at gmail.com
Mon Jun 30 01:19:05 CEST 2014
Signed-off-by: James Almer <jamrial at gmail.com>
---
libswresample/x86/resample.asm | 25 +++++++++++++++++++--
libswresample/x86/resample_x86_dsp.c | 43 +++++++++++++++++-------------------
2 files changed, 43 insertions(+), 25 deletions(-)
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index b3c431e..2cb656e 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -179,13 +179,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+ fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1
+%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
-%if cpuflag(avx)
+%if mmsize == 32
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
@@ -429,15 +433,20 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
paddd m2, m3
paddd m0, m1
%else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+ fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
+ fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3
addp%4 m0, m0, m1
+%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
-%if cpuflag(avx)
+%if mmsize == 32
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
@@ -483,8 +492,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
subp%4 xm2, xm0
mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000
+%if cpuflag(fma4) || cpuflag(fma3)
+ fmaddp%4 xm0, xm2, xm1, xm0
+%else
mulp%4 xm2, xm1
addp%4 xm0, xm2
+%endif ; cpuflag
; horizontal sum & store
movhlps xm1, xm0
@@ -564,6 +577,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
%if ARCH_X86_32
INIT_MMX mmxext
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index 9049da6..c3d8578 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,30 +27,19 @@
#include "libswresample/resample.h"
-int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
+#define RESAMPLE_FUNCS(type, opt) \
+int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+ const uint8_t *src, int sz, int upd); \
+int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+ const uint8_t *src, int sz, int upd)
-int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
-int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
- const uint8_t *src, int sz, int upd);
+RESAMPLE_FUNCS(int16, mmxext);
+RESAMPLE_FUNCS(int16, sse2);
+RESAMPLE_FUNCS(float, sse);
+RESAMPLE_FUNCS(float, avx);
+RESAMPLE_FUNCS(float, fma3);
+RESAMPLE_FUNCS(float, fma4);
+RESAMPLE_FUNCS(double, sse2);
void swresample_dsp_x86_init(ResampleContext *c)
{
@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
}
+ if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
+ c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
+ c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
+ }
+ if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
+ c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
+ c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
+ }
}
--
1.9.4.msysgit.0
More information about the ffmpeg-devel mailing list