[FFmpeg-devel] [PATCH] SIMD-optimized float_to_int32_fmul_scalar()
Michael Niedermayer
michaelni
Fri Jan 7 19:31:51 CET 2011
On Fri, Jan 07, 2011 at 01:15:37PM -0500, Justin Ruggles wrote:
> This patch implements float_to_int32_fmul_scalar() for 3dnow, sse, and
> sse2 and uses it in the AC3 encoder.
>
> benchmarks (in dezicycles) for scale_coefficients() in ac3enc_float.c:
>
> AMD Athlon 64 X2 6000+ (64-bit Ubuntu)
> C: 137485
> 3DNow: 52110
> SSE: 50257
> SSE2: 53306
>
> Intel Atom 330 (64-bit Ubuntu)
> C: 595011
> SSE: 149121
> SSE2: 148662
>
> Thanks,
> Justin
>
> ac3enc_float.c | 6 ++---
> dsputil.c | 7 ++++++
> dsputil.h | 1
> x86/dsputil_mmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 74 insertions(+), 3 deletions(-)
> db8bf89c50552f3bc830e6b957f6b7ea4bf92d06 float_to_int32_fmul_scalar.patch
> diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
> index f324636..be2be8c 100644
> --- a/libavcodec/ac3enc_float.c
> +++ b/libavcodec/ac3enc_float.c
> @@ -107,9 +107,9 @@ static int normalize_samples(AC3EncodeContext *s)
> */
> static void scale_coefficients(AC3EncodeContext *s)
> {
> - int i;
> - for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
> - s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
> + s->dsp.float_to_int32_fmul_scalar(s->fixed_coef_buffer, s->mdct_coef_buffer,
> + 16777216.0f, AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
> + emms_c();
> }
>
>
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index 29ddb4d..06728ce 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -3866,6 +3866,12 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
> dst[i] = src[i] * mul;
> }
>
> +static void float_to_int32_fmul_scalar_c(int32_t *dst, const float *src, float mul, int len){
> + int i;
> + for(i=0; i<len; i++)
> + dst[i] = lrintf(src[i] * mul);
> +}
> +
> static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
> uint32_t maxi, uint32_t maxisign)
> {
> @@ -4440,6 +4446,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
> c->vector_fmul_add = vector_fmul_add_c;
> c->vector_fmul_window = ff_vector_fmul_window_c;
> c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
> + c->float_to_int32_fmul_scalar = float_to_int32_fmul_scalar_c;
> c->vector_clipf = vector_clipf_c;
> c->float_to_int16 = ff_float_to_int16_c;
> c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index 6c56a65..23a52f1 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -381,6 +381,7 @@ typedef struct DSPContext {
> void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
> /* assume len is a multiple of 8, and arrays are 16-byte aligned */
> void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
> + void (*float_to_int32_fmul_scalar)(int32_t *dst, const float *src, float mul, int len);
The documentation of the alignment requirements and of the constraints on len is missing (must len be a multiple of 16?).
> void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
> /**
> * Multiply a vector of floats by a scalar float. Source and
> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 909ec41..41c55c1 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -2303,6 +2303,65 @@ static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu
> );
> }
>
> +static void float_to_int32_fmul_scalar_3dnow(int32_t *dst, const float *src, float mul, int len)
> +{
> + /* note: pf2id conversion uses truncation, not round-to-nearest */
> + x86_reg i = (len-4)*4;
> + __asm__ volatile(
> + "movq %3, %%mm1 \n\t"
> + "punpckldq %%mm1, %%mm1 \n\t"
> + "1: \n\t"
> + "movq (%2,%0), %%mm0 \n\t"
> + "pfmul %%mm1, %%mm0 \n\t"
> + "pf2id %%mm0, %%mm0 \n\t"
> + "movq %%mm0, (%1,%0) \n\t"
> + "sub $8, %0 \n\t"
> + "jge 1b \n\t"
> + "femms \n\t"
Duplicate *emms: femms is executed here, and the caller (scale_coefficients()) also calls emms_c().
Also, some of these loops could be unrolled to gain a bit more speed.
[ ...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
He who knows, does not speak. He who speaks, does not know. -- Lao Tsu
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20110107/15ff79f0/attachment.pgp>
More information about the ffmpeg-devel
mailing list