[FFmpeg-cvslog] ac3enc: Add x86-optimized function to speed up log2_tab().

Justin Ruggles git
Tue Feb 15 00:59:10 CET 2011


ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Sun Feb 13 14:49:50 2011 -0500| [7539a1fee2c4935eb3318d625f881df85f2c9c04] | committer: Michael Niedermayer

ac3enc: Add x86-optimized function to speed up log2_tab().

AC3DSPContext.ac3_max_msb_abs_int16() returns a value whose most significant
set bit matches that of the largest absolute value in an array of int16_t.

Signed-off-by: Ronald S. Bultje <rsbultje at gmail.com>
(cherry picked from commit fbb6b49dabc3398440c6dfa838aa090a7a6ebc0d)

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7539a1fee2c4935eb3318d625f881df85f2c9c04
---

 libavcodec/ac3dsp.c         |    9 +++++
 libavcodec/ac3dsp.h         |   11 +++++++
 libavcodec/ac3enc_fixed.c   |   11 ++-----
 libavcodec/x86/ac3dsp.asm   |   69 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_mmx.c |   11 +++++++
 5 files changed, 103 insertions(+), 8 deletions(-)

diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index f688e6a..da3a123 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
     }
 }
 
+static int ac3_max_msb_abs_int16_c(const int16_t *src, int len) /* C fallback: OR of abs() over src[0..len-1] */
+{
+    int i, v = 0;
+    for (i = 0; i < len; i++)
+        v |= abs(src[i]); /* OR-accumulate magnitudes: the top set bit of v equals that of the largest abs value */
+    return v; /* not the maximum itself, only a value sharing its MSB */
+}
+
 av_cold void ff_ac3dsp_init(AC3DSPContext *c)
 {
     c->ac3_exponent_min = ac3_exponent_min_c;
+    c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
 
     if (HAVE_MMX)
         ff_ac3dsp_init_x86(c);
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 7f13b11..a4f141f 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
      * @param nb_coefs  number of frequency coefficients.
      */
     void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+
+    /**
+     * Calculate a value whose most significant set bit matches that of the
+     * largest absolute value in an array of int16_t.
+     * @param src input array
+     *            constraints: align 16. values must be in range [-32767,32767]
+     * @param len number of values in the array
+     *            constraints: multiple of 16 greater than 0
+     * @return    a value with the same MSB as max(abs(src[]))
+     */
+    int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
 } AC3DSPContext;
 
 void ff_ac3dsp_init    (AC3DSPContext *c);
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index 0db41df..3de00ee 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
  * @param n   number of values in the array
  * @return    log2(max(abs(tab[])))
  */
-static int log2_tab(int16_t *tab, int n)
+static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
 {
-    int i, v;
-
-    v = 0;
-    for (i = 0; i < n; i++)
-        v |= abs(tab[i]);
-
+    int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len);
     return av_log2(v);
 }
 
@@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
  */
 static int normalize_samples(AC3EncodeContext *s)
 {
-    int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE);
+    int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
     lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
     return v - 9;
 }
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index e71c51c..dc71ccf 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2
 %endif
 %undef PMINUB
 %undef LOOP_ALIGN
+
+;-----------------------------------------------------------------------------
+; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
+;
+; This function uses 2 different methods to calculate a valid result.
+; 1) logical 'or' of abs of each element
+;        This is used for ssse3 because of the pabsw instruction.
+;        It is also used for mmx because of the lack of min/max instructions.
+; 2) calculate min/max for the array, then or(abs(min),abs(max))
+;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
+;-----------------------------------------------------------------------------
+
+%macro AC3_MAX_MSB_ABS_INT16 2 ; %1 = function name suffix, %2 = method (min_max or or_abs)
+cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
+    pxor        m2, m2 ; m2 = 0: running OR (or_abs) or running minimum (min_max)
+    pxor        m3, m3 ; m3 = 0: running maximum (min_max); passed to ABS2 in the mmx or_abs path
+.loop:
+%ifidn %2, min_max
+    mova        m0, [srcq]
+    mova        m1, [srcq+mmsize]
+    pminsw      m2, m0
+    pminsw      m2, m1
+    pmaxsw      m3, m0
+    pmaxsw      m3, m1
+%else ; or_abs
+%ifidn %1, mmx
+    mova        m0, [srcq]
+    mova        m1, [srcq+mmsize]
+    ABS2        m0, m1, m3, m4 ; presumably m3/m4 are scratch for ABS2 -- confirm in its definition
+%else ; ssse3
+    ; using memory args is faster for ssse3
+    pabsw       m0, [srcq]
+    pabsw       m1, [srcq+mmsize]
+%endif
+    por         m2, m0
+    por         m2, m1
+%endif
+    add       srcq, mmsize*2 ; two registers (2*mmsize bytes) are consumed per iteration
+    sub       lend, mmsize ; 2*mmsize bytes hold mmsize int16_t values
+    ja .loop
+%ifidn %2, min_max
+    ABS2        m2, m3, m0, m1
+    por         m2, m3 ; or(abs(min), abs(max)), method 2 of the header comment
+%endif
+%ifidn mmsize, 16
+    mova        m0, m2
+    punpckhqdq  m0, m0
+    por         m2, m0 ; fold the high 64 bits into the low 64 bits
+%endif
+    PSHUFLW     m0, m2, 0xe
+    por         m2, m0 ; fold words 2-3 into words 0-1
+    PSHUFLW     m0, m2, 0x1
+    por         m2, m0 ; fold word 1 into word 0
+    movd       eax, m2
+    and        eax, 0xFFFF ; keep only the low 16-bit word, which holds the combined result
+    RET
+%endmacro
+
+INIT_MMX
+%define ABS2 ABS2_MMX
+%define PSHUFLW pshufw
+AC3_MAX_MSB_ABS_INT16 mmx, or_abs ; no pminsw/pmaxsw on plain mmx, so OR the abs values
+%define ABS2 ABS2_MMX2
+AC3_MAX_MSB_ABS_INT16 mmxext, min_max ; pminsw/pmaxsw available: track min/max, abs once at the end
+INIT_XMM
+%define PSHUFLW pshuflw
+AC3_MAX_MSB_ABS_INT16 sse2, min_max ; same min/max method, instantiated after INIT_XMM
+%define ABS2 ABS2_SSSE3
+AC3_MAX_MSB_ABS_INT16 ssse3, or_abs ; uses pabsw directly; ABS2 is not expanded in this variant
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 7ce3aa3..d8af59c 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int n
 extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 
+extern int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
+
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 {
     int mm_flags = av_get_cpu_flags();
@@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
 #if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
     }
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
     }
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+    }
+    if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
     }
 #endif
 }




More information about the ffmpeg-cvslog mailing list