[FFmpeg-devel] [PATCH] SIMD-optimized float_to_int32_fmul_scalar()
Justin Ruggles
justin.ruggles
Fri Jan 7 19:52:38 CET 2011
On 01/07/2011 01:31 PM, Michael Niedermayer wrote:
> On Fri, Jan 07, 2011 at 01:15:37PM -0500, Justin Ruggles wrote:
>> This patch implements float_to_int32_fmul_scalar() for 3dnow, sse, and
>> sse2 and uses it in the AC3 encoder.
>>
>> benchmarks (in dezicycles) for scale_coefficients() in ac3enc_float.c:
>>
>> AMD Athlon 64 X2 6000+ (64-bit Ubuntu)
>> C: 137485
>> 3DNow: 52110
>> SSE: 50257
>> SSE2: 53306
>>
>> Intel Atom 330 (64-bit Ubuntu)
>> C: 595011
>> SSE: 149121
>> SSE2: 148662
>>
>> Thanks,
>> Justin
>>
>
>> ac3enc_float.c | 6 ++---
>> dsputil.c | 7 ++++++
>> dsputil.h | 1
>> x86/dsputil_mmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 74 insertions(+), 3 deletions(-)
>> db8bf89c50552f3bc830e6b957f6b7ea4bf92d06 float_to_int32_fmul_scalar.patch
>> diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
>> index f324636..be2be8c 100644
>> --- a/libavcodec/ac3enc_float.c
>> +++ b/libavcodec/ac3enc_float.c
>> @@ -107,9 +107,9 @@ static int normalize_samples(AC3EncodeContext *s)
>> */
>> static void scale_coefficients(AC3EncodeContext *s)
>> {
>> - int i;
>> - for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
>> - s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
>> + s->dsp.float_to_int32_fmul_scalar(s->fixed_coef_buffer, s->mdct_coef_buffer,
>> + 16777216.0f, AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
>> + emms_c();
>> }
>>
>>
>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>> index 29ddb4d..06728ce 100644
>> --- a/libavcodec/dsputil.c
>> +++ b/libavcodec/dsputil.c
>> @@ -3866,6 +3866,12 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
>> dst[i] = src[i] * mul;
>> }
>>
>> +static void float_to_int32_fmul_scalar_c(int32_t *dst, const float *src, float mul, int len){
>> + int i;
>> + for(i=0; i<len; i++)
>> + dst[i] = lrintf(src[i] * mul);
>> +}
>> +
>> static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
>> uint32_t maxi, uint32_t maxisign)
>> {
>> @@ -4440,6 +4446,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
>> c->vector_fmul_add = vector_fmul_add_c;
>> c->vector_fmul_window = ff_vector_fmul_window_c;
>> c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
>> + c->float_to_int32_fmul_scalar = float_to_int32_fmul_scalar_c;
>> c->vector_clipf = vector_clipf_c;
>> c->float_to_int16 = ff_float_to_int16_c;
>> c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
>> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
>> index 6c56a65..23a52f1 100644
>> --- a/libavcodec/dsputil.h
>> +++ b/libavcodec/dsputil.h
>> @@ -381,6 +381,7 @@ typedef struct DSPContext {
>> void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
>> /* assume len is a multiple of 8, and arrays are 16-byte aligned */
>> void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
>> + void (*float_to_int32_fmul_scalar)(int32_t *dst, const float *src, float mul, int len);
>
> missing alignment requirements and len value requirements documentation (multiple of 16?)
OK, I'll add that.
>
>> void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
>> /**
>> * Multiply a vector of floats by a scalar float. Source and
>> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
>> index 909ec41..41c55c1 100644
>> --- a/libavcodec/x86/dsputil_mmx.c
>> +++ b/libavcodec/x86/dsputil_mmx.c
>> @@ -2303,6 +2303,65 @@ static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu
>> );
>> }
>>
>> +static void float_to_int32_fmul_scalar_3dnow(int32_t *dst, const float *src, float mul, int len)
>> +{
>> + /* note: pf2id conversion uses truncation, not round-to-nearest */
>> + x86_reg i = (len-4)*4;
>> + __asm__ volatile(
>> + "movq %3, %%mm1 \n\t"
>> + "punpckldq %%mm1, %%mm1 \n\t"
>> + "1: \n\t"
>> + "movq (%2,%0), %%mm0 \n\t"
>> + "pfmul %%mm1, %%mm0 \n\t"
>> + "pf2id %%mm0, %%mm0 \n\t"
>> + "movq %%mm0, (%1,%0) \n\t"
>> + "sub $8, %0 \n\t"
>> + "jge 1b \n\t"
>> + "femms \n\t"
>
> duplicate *emms
Oops... you're right. An emms is already required because of the SSE
version, so I'll take it out.
> also some of these can be unrolled to gain a bit more speed
Unrolling didn't give me any benefit in testing, but that was only on an
Athlon. I'll do more tests and try it on the Atom as well.
-Justin
More information about the ffmpeg-devel
mailing list