[FFmpeg-devel] [PATCH] x86/dcadsp: add ff_decode_hf_avx2()

James Almer jamrial at gmail.com
Wed Feb 19 05:39:12 CET 2014


Signed-off-by: James Almer <jamrial at gmail.com>
---
This patch depends on "[PATCH 10/10] dcadsp: x86: SSE implementation of decode_hf" by Christophe Gisquet.
Tested only under Intel SDE (an emulator), so no benchmarks were run, but I think it's safe to assume it's faster.

Benchmarking and testing on actual hardware are welcome.
---
 libavcodec/x86/dcadsp.asm    | 21 ++++++++++++++++++---
 libavcodec/x86/dcadsp_init.c |  7 +++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 5aed8bc..8eecbc3 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -41,18 +41,24 @@ cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
 .loop:
 %if ARCH_X86_64
     mov    offsetd, [scaleq + 2*startq]
-    cvtsi2ss    m0, offsetd
+    cvtsi2ss    xmm0, offsetd
 %else
-    cvtsi2ss    m0, [scaleq + 2*startq]
+    cvtsi2ss    xmm0, [scaleq + 2*startq]
 %endif
     mov    offsetd, [numq + startq]
-    mulss       m0, [pf_inv16]
+    mulss       xmm0, [pf_inv16]
     shl       DICT, 5
+%if cpuflag(avx2)
+    vbroadcastss m0, xmm0
+%else
     shufps      m0, m0, 0
+%endif
 %if cpuflag(sse2)
 %if cpuflag(sse4)
     pmovsxbd    m1, [srcq + DICT + 0]
+%if notcpuflag(avx2)
     pmovsxbd    m2, [srcq + DICT + 4]
+%endif
 %else
     movq        m1, [srcq + DICT]
     punpcklbw   m1, m1
@@ -63,7 +69,9 @@ cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
     psrad       m2, 24
 %endif
     cvtdq2ps    m1, m1
+%if notcpuflag(avx2)
     cvtdq2ps    m2, m2
+%endif
 %else
     movd       mm0, [srcq + DICT + 0]
     movd       mm1, [srcq + DICT + 4]
@@ -88,9 +96,13 @@ cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
     shufps      m2, m4, q1010
 %endif
     mulps       m1, m0
+%if notcpuflag(avx2)
     mulps       m2, m0
+%endif
     mova [dstq + 8*startq +  0], m1
+%if notcpuflag(avx2)
     mova [dstq + 8*startq + 16], m2
+%endif
     add     startq, 4
     cmp     startq, endm
     jl       .loop
@@ -111,3 +123,6 @@ DECODE_HF
 
 INIT_XMM sse4
 DECODE_HF
+
+INIT_YMM avx2
+DECODE_HF
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index fde1297..f578df7 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -32,6 +32,9 @@ void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS
 void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
                        int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_avx2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+                       const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                       int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
 
 av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 {
@@ -50,4 +53,8 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
     if (EXTERNAL_SSE4(cpu_flags)) {
         s->decode_hf = ff_decode_hf_sse4;
     }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        s->decode_hf = ff_decode_hf_avx2;
+    }
 }
-- 
1.8.3.2




More information about the ffmpeg-devel mailing list