[FFmpeg-devel] [PATCH] flac/x86: add ff_flac_lpc_32_xop()
James Almer
jamrial at gmail.com
Fri Feb 7 19:30:24 CET 2014
Tested on an AMD FX 6300 (the XOP version needs about 12% fewer decicycles than the SSE4 version):
679081 decicycles in ff_flac_lpc_32_xop, 32768 runs
774425 decicycles in ff_flac_lpc_32_sse4, 32768 runs
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/flacdsp.asm | 21 ++++++++++++---------
libavcodec/x86/flacdsp_init.c | 6 ++++++
libavutil/x86/x86inc.asm | 4 ++++
3 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index e28f905..1a83cd8 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -24,7 +24,8 @@
SECTION .text
-INIT_XMM sse4
+%macro LPC_32 1
+INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
sub lend, pred_orderd
jle .ret
@@ -43,25 +44,21 @@ ALIGN 16
test jq, jq
jz .end_order
.loop_order:
- pmuldq m0, m1
- paddq m2, m0
+ pmacsdql m2, m0, m1, m2
movd m0, [decodedq+jq*4]
- pmuldq m1, m0
- paddq m3, m1
+ pmacsdql m3, m1, m0, m3
movd m1, [coeffsq+jq*4]
inc jq
jl .loop_order
.end_order:
- pmuldq m0, m1
- paddq m2, m0
+ pmacsdql m2, m0, m1, m2
psrlq m2, m4
movd m0, [decodedq]
paddd m0, m2
movd [decodedq], m0
sub lend, 2
jl .ret
- pmuldq m1, m0
- paddq m3, m1
+ pmacsdql m3, m1, m0, m3
psrlq m3, m4
movd m1, [decodedq+4]
paddd m1, m3
@@ -69,3 +66,9 @@ ALIGN 16
jg .loop_sample
.ret:
REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index d30a41e..095b762 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -24,6 +24,8 @@
void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
int bps)
@@ -35,5 +37,9 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
if (bps > 16)
c->lpc = ff_flac_lpc_32_sse4;
}
+ if (EXTERNAL_XOP(cpu_flags)) {
+ if (bps > 16)
+ c->lpc = ff_flac_lpc_32_xop;
+ }
#endif
}
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c455367..88cae0c 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1411,6 +1411,9 @@ AVX_INSTR pfmul, 1, 0, 1
%macro %1 4-7 %1, %2, %3
%if cpuflag(xop)
v%5 %1, %2, %3, %4
+ %elifidn %1, %4
+ %6 %2, %3
+ %7 %1, %2
%else
%6 %1, %2, %3
%7 %1, %4
@@ -1420,6 +1423,7 @@ AVX_INSTR pfmul, 1, 0, 1
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmacsdql, pmuldq, paddq
FMA_INSTR pmadcswd, pmaddwd, paddd
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
--
1.8.3.2
More information about the ffmpeg-devel mailing list