[FFmpeg-devel] [PATCH] SSE-optimized vector_clipf()

Michael Niedermayer michaelni
Fri Aug 7 14:13:50 CEST 2009


On Thu, Aug 06, 2009 at 02:55:30AM +0200, Vitor Sessak wrote:
> Vitor Sessak wrote:
>> $subj, 10% speedup for twinvq decoding (but should be useful also for AMR 
>> and wmapro).
>
> err, I mean, attached.
>
> -Vitor

>  dsputil.c         |   15 +++++++++++++++
>  dsputil.h         |    3 ++-
>  x86/dsputil_mmx.c |   34 ++++++++++++++++++++++++++++++++++
>  3 files changed, 51 insertions(+), 1 deletion(-)
> 8a95f5f2f3d267089056d6a571b2e6cc37d1569e  dsp_vector_clipf.diff
> Index: libavcodec/dsputil.c
> ===================================================================
> --- libavcodec/dsputil.c	(revision 19598)
> +++ libavcodec/dsputil.c	(working copy)
> @@ -4093,6 +4093,20 @@
>          dst[i] = src[i] * mul;
>  }
>  
> +void vector_clipf_c(float *dst, float min, float max, int len) {
> +    int i;
> +    for (i=0; i < len; i+=8) {
> +        dst[i    ] = av_clipf(dst[i    ], min, max);
> +        dst[i + 1] = av_clipf(dst[i + 1], min, max);
> +        dst[i + 2] = av_clipf(dst[i + 2], min, max);
> +        dst[i + 3] = av_clipf(dst[i + 3], min, max);
> +        dst[i + 4] = av_clipf(dst[i + 4], min, max);
> +        dst[i + 5] = av_clipf(dst[i + 5], min, max);
> +        dst[i + 6] = av_clipf(dst[i + 6], min, max);
> +        dst[i + 7] = av_clipf(dst[i + 7], min, max);
> +    }
> +}

you could try implementing this one with integer math instead of floats
(assuming IEEE floats, of course)

> +
>  static av_always_inline int float_to_int16_one(const float *src){
>      int_fast32_t tmp = *(const int32_t*)src;
>      if(tmp & 0xf0000){
> @@ -4669,6 +4683,7 @@
>      c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
>      c->vector_fmul_window = ff_vector_fmul_window_c;
>      c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
> +    c->vector_clipf = vector_clipf_c;
>      c->float_to_int16 = ff_float_to_int16_c;
>      c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
>      c->add_int16 = add_int16_c;
> Index: libavcodec/dsputil.h
> ===================================================================
> --- libavcodec/dsputil.h	(revision 19598)
> +++ libavcodec/dsputil.h	(working copy)
> @@ -396,7 +396,8 @@
>      void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
>      /* assume len is a multiple of 8, and arrays are 16-byte aligned */
>      void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
> -
> +    /* assume len is a multiple of 16, and dst is 16-byte aligned */
> +    void (*vector_clipf)(float *dst, float min, float max, int len);

alignment requirements are generally written like:

void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);


>      /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
>       * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
>      void (*float_to_int16)(int16_t *dst, const float *src, long len);
> Index: libavcodec/x86/dsputil_mmx.c
> ===================================================================
> --- libavcodec/x86/dsputil_mmx.c	(revision 19598)
> +++ libavcodec/x86/dsputil_mmx.c	(working copy)
> @@ -2346,6 +2346,39 @@
>      );
>  }
>  
> +void vector_clipf_sse(float *dst, float min, float max, int len)
> +{
> +    x86_reg i = (len-16)*4;
> +    __asm__ volatile(
> +        "movss  %2, %%xmm4 \n"
> +        "movss  %3, %%xmm5 \n"
> +        "shufps $0, %%xmm4, %%xmm4 \n"
> +        "shufps $0, %%xmm5, %%xmm5 \n"
> +        "1: \n\t"
> +        "movaps    (%1,%0), %%xmm0 \n\t" // 3/1 on intel
> +        "movaps  16(%1,%0), %%xmm1 \n\t"
> +        "movaps  32(%1,%0), %%xmm2 \n\t"
> +        "movaps  48(%1,%0), %%xmm3 \n\t"
> +        "maxps      %%xmm4, %%xmm0 \n\t"
> +        "maxps      %%xmm4, %%xmm1 \n\t"
> +        "maxps      %%xmm4, %%xmm2 \n\t"
> +        "maxps      %%xmm4, %%xmm3 \n\t"
> +        "minps      %%xmm5, %%xmm0 \n\t"
> +        "minps      %%xmm5, %%xmm1 \n\t"
> +        "minps      %%xmm5, %%xmm2 \n\t"
> +        "minps      %%xmm5, %%xmm3 \n\t"
> +        "movaps  %%xmm0,   (%1,%0) \n\t"
> +        "movaps  %%xmm1, 16(%1,%0) \n\t"
> +        "movaps  %%xmm2, 32(%1,%0) \n\t"
> +        "movaps  %%xmm3, 48(%1,%0) \n\t"
> +        "sub  $64, %0 \n\t"

did you benchmark the backward direction vs forward?

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

While the State exists there can be no freedom; when there is freedom there
will be no State. -- Vladimir Lenin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090807/db3e127a/attachment.pgp>



More information about the ffmpeg-devel mailing list