[FFmpeg-devel] [PATCH 4/8] avcodec/flac: partially unroll loop in flac_enc_lpc_32

Rostislav Pehlivanov atomnuker at gmail.com
Mon Nov 27 01:21:15 EET 2017


On 26 November 2017 at 22:51, James Darnley <james.darnley at gmail.com> wrote:

> Now does 6 samples per iteration, up from 2.
>
> This is a further 1.6 to 2.1 times faster, and 2.5 to 3.9 times faster overall.
> Runtime is reduced by a further 4 to 17%, and by 9 to 65% overall.
>
> Same conditions as previously.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 30 +++++++++++++++++++++++++-----
>  1 file changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 618306eb5f..4d212ed212 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
>  %macro FUNCTION_BODY_32 0
>
>  %if ARCH_X86_64
> -    cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
>      DECLARE_REG_TMP 5, 6
>      %define length r2d
>
>      movsxd orderq, orderd
>  %else
> -    cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
>      DECLARE_REG_TMP 2, 5
>      %define length r2mp
>  %endif
> @@ -190,6 +190,8 @@ mova  [rsp],    m4            ; save sign extend mask
>
>  .looplen:
>      pxor m0,   m0
> +    pxor m4,   m4
> +    pxor m6,   m6
>      mov  posj, orderq
>      xor  negj, negj
>
> @@ -197,23 +199,41 @@ mova  [rsp],    m4            ; save sign extend mask
>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
>          pmovzxdq m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
> +        pmovzxdq m5,  [smpq+negj*4-4+mmsize/2]
> +        pmovzxdq m7,  [smpq+negj*4-4+mmsize]
>          pmuldq m1,   m2
> +        pmuldq m5,   m2
> +        pmuldq m7,   m2
>          paddq  m0,   m1             ; p += c * s
> +        paddq  m4,   m5
> +        paddq  m6,   m7
>
>          dec    negj
>          inc    posj
>      jnz .looporder
>
>      HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
> +    HACK_PSRAQ m4, m3, [rsp], m2
> +    HACK_PSRAQ m6, m3, [rsp], m2
>      CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> +    CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
> +    CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
>      pshufd  m0,    m0, q0020 ; pack into first 2 dwords
> +    pshufd  m4,    m4, q0020
> +    pshufd  m6,    m6, q0020
>      movh    m1,   [smpq]
> +    movh    m5,   [smpq+mmsize/2]
> +    movh    m7,   [smpq+mmsize]
>      psubd   m1,    m0               ; smp[i] - p
> +    psubd   m5,    m4
> +    psubd   m7,    m6
>      movh   [resq], m1               ; res[i] = smp[i] - (p >> shift)
> +    movh   [resq+mmsize/2], m5
> +    movh   [resq+mmsize], m7
>
> -    add resq,   mmsize/2
> -    add smpq,   mmsize/2
> -    sub length, mmsize/8
> +    add resq,   (3*mmsize)/2
> +    add smpq,   (3*mmsize)/2
> +    sub length, (3*mmsize)/8
>  jg .looplen
>  RET
>
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Looks good to me, thanks.


More information about the ffmpeg-devel mailing list