[FFmpeg-devel] [PATCH 2/2] ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder.

Fri Mar 11 23:44:52 CET 2011

Justin Ruggles <justin.ruggles at gmail.com> writes:

> ---
>  libavcodec/ac3dsp.c         |   24 +++++++++++++++++++++
>  libavcodec/ac3dsp.h         |   22 +++++++++++++++++++
>  libavcodec/ac3enc_fixed.c   |   40 ++---------------------------------
>  libavcodec/x86/ac3dsp.asm   |   48 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/ac3dsp_mmx.c |   12 ++++++++++
>  5 files changed, 109 insertions(+), 37 deletions(-)
>
>
> diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
> index da3a123..83f48a6 100644
> --- a/libavcodec/ac3dsp.c
> +++ b/libavcodec/ac3dsp.c
> @@ -50,10 +50,34 @@ static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
>      return v;
>  }
>
> +static void ac3_lshift_int16_c(int16_t *src, unsigned int len,
> +                               unsigned int shift)
> +{
> +    int i;
> +
> +    if (shift > 0) {
> +        for (i = 0; i < len; i++)
> +            src[i] <<= shift;
> +    }
> +}
> +
> +static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
> +                               unsigned int shift)
> +{
> +    int i;
> +
> +    if (shift > 0) {
> +        for (i = 0; i < len; i++)
> +            src[i] >>= shift;
> +    }
> +}
> +
>  av_cold void ff_ac3dsp_init(AC3DSPContext *c)
>  {
>      c->ac3_exponent_min = ac3_exponent_min_c;
>      c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
> +    c->ac3_lshift_int16 = ac3_lshift_int16_c;
> +    c->ac3_rshift_int32 = ac3_rshift_int32_c;
>
>      if (HAVE_MMX)
>          ff_ac3dsp_init_x86(c);
> diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
> index a4f141f..31a0af3 100644
> --- a/libavcodec/ac3dsp.h
> +++ b/libavcodec/ac3dsp.h
> @@ -46,6 +46,28 @@ typedef struct AC3DSPContext {
>       * @return    a value with the same MSB as max(abs(src[]))
>       */
>      int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
> +
> +    /**
> +     * Left-shift each value in an array of int16_t by a specified amount.
> +     * @param src    input array
> +     *               constraints: align 16
> +     * @param len    number of values in the array
> +     *               constraints: multiple of 32 greater than 0
> +     * @param shift  left shift amount
> +     *               constraints: range [0,15]
> +     */
> +    void (*ac3_lshift_int16)(int16_t *src, unsigned int len, unsigned int shift);
> +
> +    /**
> +     * Right-shift each value in an array of int32_t by a specified amount.
> +     * @param src    input array
> +     *               constraints: align 16
> +     * @param len    number of values in the array
> +     *               constraints: multiple of 16 greater than 0
> +     * @param shift  right shift amount
> +     *               constraints: range [0,31]
> +     */
> +    void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
>  } AC3DSPContext;
>
>  void ff_ac3dsp_init    (AC3DSPContext *c);
> diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
> index e750a39..baa1363 100644
> --- a/libavcodec/ac3enc_fixed.c
> +++ b/libavcodec/ac3enc_fixed.c
> @@ -278,40 +278,6 @@ static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
>
>  /**
> - * Left-shift each value in an array by a specified amount.
> - * @param tab    input array
> - * @param n      number of values in the array
> - * @param lshift left shift amount
> - */
> -static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
> -{
> -    int i;
> -
> -    if (lshift > 0) {
> -        for (i = 0; i < n; i++)
> -            tab[i] <<= lshift;
> -    }
> -}
> -
> -
> -/**
> - * Right-shift each value in an array of int32_t by a specified amount.
> - * @param src    input array
> - * @param len    number of values in the array
> - * @param shift  right shift amount
> - */
> -static void ac3_rshift_int32_c(int32_t *src, unsigned int len, unsigned int shift)
> -{
> -    int i;
> -
> -    if (shift > 0) {
> -        for (i = 0; i < len; i++)
> -            src[i] >>= shift;
> -    }
> -}
> -
> -
> -/**
>   * Normalize the input samples to use the maximum available precision.
>   * This assumes signed 16-bit input samples.
>   *
> @@ -320,7 +286,7 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len, unsigned int shif
>  static int normalize_samples(AC3EncodeContext *s)
>  {
>      int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
> -    lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
> +    s->ac3dsp.ac3_lshift_int16(s->windowed_samples, AC3_WINDOW_SIZE, v);
>      /* +6 to right-shift from 31-bit to 25-bit */
>      return v + 6;
>  }
> @@ -336,8 +302,8 @@ static void scale_coefficients(AC3EncodeContext *s)
>      for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
>          AC3Block *block = &s->blocks[blk];
>          for (ch = 0; ch < s->channels; ch++) {
> -            ac3_rshift_int32_c(block->mdct_coef[ch], AC3_MAX_COEFS,
> -                               block->coeff_shift[ch]);
> +            s->ac3dsp.ac3_rshift_int32(block->mdct_coef[ch], AC3_MAX_COEFS,
> +                                       block->coeff_shift[ch]);
>          }
>      }
>  }

Looks good so far.

> diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
> index b1eeca9..320180c 100644
> --- a/libavcodec/x86/ac3dsp.asm
> +++ b/libavcodec/x86/ac3dsp.asm
> @@ -133,3 +133,51 @@ INIT_XMM
>  AC3_MAX_MSB_ABS_INT16 sse2, min_max
>  %define ABS2 ABS2_SSSE3
>  AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
> +
> +;-----------------------------------------------------------------------------
> +; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
> +;-----------------------------------------------------------------------------
> +
> +%macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
> +cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
> +    test  shiftd, shiftd
> +    jz .end
> +    movd      m0, shiftd
> +    ALIGN 8
> +.loop:
> +    mova      m1, [srcq         ]
> +    mova      m2, [srcq+mmsize  ]
> +    mova      m3, [srcq+mmsize*2]
> +    mova      m4, [srcq+mmsize*3]
> +    %3        m1, m0
> +    %3        m2, m0
> +    %3        m3, m0
> +    %3        m4, m0
> +    mova  [srcq         ], m1
> +    mova  [srcq+mmsize  ], m2
> +    mova  [srcq+mmsize*2], m3
> +    mova  [srcq+mmsize*3], m4
> +    add     srcq, mmsize*4
> +    sub     lend, mmsize*32/%2
> +    ja .loop
> +.end:
> +    REP_RET
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
> +;-----------------------------------------------------------------------------
> +
> +INIT_MMX
> +AC3_SHIFT l, 16, psllw, mmx
> +INIT_XMM
> +AC3_SHIFT l, 16, psllw, sse2
> +
> +;-----------------------------------------------------------------------------
> +; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
> +;-----------------------------------------------------------------------------
> +
> +INIT_MMX
> +AC3_SHIFT r, 32, psrad, mmx
> +INIT_XMM
> +AC3_SHIFT r, 32, psrad, sse2

Within the limits of my x86 knowledge, this seems OK.

> diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
> index d8af59c..835b106 100644
> --- a/libavcodec/x86/ac3dsp_mmx.c
> +++ b/libavcodec/x86/ac3dsp_mmx.c
> @@ -32,6 +32,12 @@ extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
>  extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
>  extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
>
> +extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
> +extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
> +
> +extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
> +extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
> +
>  av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
>  {
>      int mm_flags = av_get_cpu_flags();
> @@ -40,6 +46,8 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
>      if (mm_flags & AV_CPU_FLAG_MMX) {
>          c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
>          c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
> +        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
> +        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
>      }
>      if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
>          c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
> @@ -48,6 +56,10 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
>      if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
>          c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
>          c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
> +        if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
> +            c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
> +            c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
> +        }
>      }
>      if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
>          c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;

OK

-- 
Måns Rullgård
mans at mansr.com