[FFmpeg-devel] [PATCH] SIMD-optimized float_to_int32_fmul_scalar()

Fri Jan 7 19:52:38 CET 2011

On 01/07/2011 01:31 PM, Michael Niedermayer wrote:

> On Fri, Jan 07, 2011 at 01:15:37PM -0500, Justin Ruggles wrote:
>> This patch implements float_to_int32_fmul_scalar() for 3dnow, sse, and
>> sse2 and uses it in the AC3 encoder.
>>
>> benchmarks (in dezicycles) for scale_coefficients() in ac3enc_float.c:
>>
>> AMD Athlon 64 X2 6000+ (64-bit Ubuntu)
>>     C: 137485
>> 3DNow:  52110
>>   SSE:  50257
>>  SSE2:  53306
>>
>> Intel Atom 330 (64-bit Ubuntu)
>>     C: 595011
>>   SSE: 149121
>>  SSE2: 148662
>>
>> Thanks,
>> Justin
>>
> 
>>  ac3enc_float.c    |    6 ++---
>>  dsputil.c         |    7 ++++++
>>  dsputil.h         |    1 
>>  x86/dsputil_mmx.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 74 insertions(+), 3 deletions(-)
>> db8bf89c50552f3bc830e6b957f6b7ea4bf92d06  float_to_int32_fmul_scalar.patch
>> diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
>> index f324636..be2be8c 100644
>> --- a/libavcodec/ac3enc_float.c
>> +++ b/libavcodec/ac3enc_float.c
>> @@ -107,9 +107,9 @@ static int normalize_samples(AC3EncodeContext *s)
>>   */
>>  static void scale_coefficients(AC3EncodeContext *s)
>>  {
>> -    int i;
>> -    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
>> -        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
>> +    s->dsp.float_to_int32_fmul_scalar(s->fixed_coef_buffer, s->mdct_coef_buffer,
>> +                                      16777216.0f, AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
>> +    emms_c();
>>  }
>>  
>>  
>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>> index 29ddb4d..06728ce 100644
>> --- a/libavcodec/dsputil.c
>> +++ b/libavcodec/dsputil.c
>> @@ -3866,6 +3866,12 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
>>          dst[i] = src[i] * mul;
>>  }
>>  
>> +static void float_to_int32_fmul_scalar_c(int32_t *dst, const float *src, float mul, int len){
>> +    int i;
>> +    for(i=0; i<len; i++)
>> +        dst[i] = lrintf(src[i] * mul);
>> +}
>> +
>>  static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
>>                     uint32_t maxi, uint32_t maxisign)
>>  {
>> @@ -4440,6 +4446,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
>>      c->vector_fmul_add = vector_fmul_add_c;
>>      c->vector_fmul_window = ff_vector_fmul_window_c;
>>      c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
>> +    c->float_to_int32_fmul_scalar = float_to_int32_fmul_scalar_c;
>>      c->vector_clipf = vector_clipf_c;
>>      c->float_to_int16 = ff_float_to_int16_c;
>>      c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
>> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
>> index 6c56a65..23a52f1 100644
>> --- a/libavcodec/dsputil.h
>> +++ b/libavcodec/dsputil.h
>> @@ -381,6 +381,7 @@ typedef struct DSPContext {
>>      void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
>>      /* assume len is a multiple of 8, and arrays are 16-byte aligned */
>>      void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
>> +    void (*float_to_int32_fmul_scalar)(int32_t *dst, const float *src, float mul, int len);
> 
> missing alignment requirements and len value requirements documentation (multiple of 16?)

OK. I'll document the alignment and len requirements.

> 
>>      void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
>>      /**
>>       * Multiply a vector of floats by a scalar float.  Source and
>> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
>> index 909ec41..41c55c1 100644
>> --- a/libavcodec/x86/dsputil_mmx.c
>> +++ b/libavcodec/x86/dsputil_mmx.c
>> @@ -2303,6 +2303,65 @@ static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu
>>      );
>>  }
>>  
>> +static void float_to_int32_fmul_scalar_3dnow(int32_t *dst, const float *src, float mul, int len)
>> +{
>> +    /* note: pf2id conversion uses truncation, not round-to-nearest */
>> +    x86_reg i = (len-4)*4;
>> +    __asm__ volatile(
>> +        "movq          %3,   %%mm1      \n\t"
>> +        "punpckldq  %%mm1,   %%mm1      \n\t"
>> +        "1:                             \n\t"
>> +        "movq     (%2,%0),   %%mm0      \n\t"
>> +        "pfmul      %%mm1,   %%mm0      \n\t"
>> +        "pf2id      %%mm0,   %%mm0      \n\t"
>> +        "movq       %%mm0, (%1,%0)      \n\t"
>> +        "sub $8, %0                     \n\t"
>> +        "jge 1b                         \n\t"
>> +        "femms                          \n\t"
> 
> duplicate *emms

Oops... you're right. An emms is already required because of the SSE
version, so the femms here is redundant. I'll take it out.

> also some of these can be unrolled to gain a bit more speed

Unrolling didn't give me any benefit in testing, but that was only on the
Athlon. I'll do more tests and try it on the Atom as well.

-Justin