[FFmpeg-devel] [PATCH] lavu/x86/lls: add fma3 optimizations for update_lls

Ganesh Ajjanagadde gajjanagadde at gmail.com
Thu Jan 14 01:03:53 CET 2016


On Wed, Jan 13, 2016 at 6:59 PM, Ganesh Ajjanagadde
<gajjanagadde at gmail.com> wrote:
> This improves accuracy (very slightly) and speed for processors having
> fma3.
>
> Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
> old:
> 5993610 decicycles in ff_lpc_calc_coefs,      64 runs,      0 skips
> 5951528 decicycles in ff_lpc_calc_coefs,     128 runs,      0 skips
>
> new:
> 5252410 decicycles in ff_lpc_calc_coefs,      64 runs,      0 skips
> 5232869 decicycles in ff_lpc_calc_coefs,     128 runs,      0 skips
>
> Tested with FATE and --disable-fma3, also examined contents of
> lavu/lls-test.
>
> Signed-off-by: Ganesh Ajjanagadde <gajjanagadde at gmail.com>
> ---
>  libavutil/x86/lls.asm    | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
>  libavutil/x86/lls_init.c |  4 ++++
>  2 files changed, 63 insertions(+), 2 deletions(-)
>
> diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
> index 769befb..358603a 100644
> --- a/libavutil/x86/lls.asm
> +++ b/libavutil/x86/lls.asm
> @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
>  .ret:
>      REP_RET
>
> -%if HAVE_AVX_EXTERNAL
> -INIT_YMM avx
> +%macro UPDATE_LLS 0
>  cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      %define covarq ctxq
>      mov  countd, [ctxq + LLSModel.indep_count]
> @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      vbroadcastsd ymm6, [varq + iq*8 + 16]
>      vbroadcastsd ymm7, [varq + iq*8 + 24]
>      vextractf128 xmm3, ymm1, 1
> +%if cpuflag(fma3)
> +    mova ymm0, COVAR(iq  ,0)
> +    mova xmm2, COVAR(iq+2,2)
> +    vfmadd231pd ymm0, ymm1, ymm4
> +    vfmadd231pd xmm2, xmm3, xmm6
> +    vfmadd213pd ymm1, ymm5, COVAR(iq  ,1)
> +    vfmadd213pd xmm3, xmm7, COVAR(iq+2,3)
> +    mova COVAR(iq  ,0), ymm0
> +    mova COVAR(iq  ,1), ymm1
> +    mova COVAR(iq+2,2), xmm2
> +    mova COVAR(iq+2,3), xmm3
> +%else
>      vmulpd  ymm0, ymm1, ymm4
>      vmulpd  ymm1, ymm1, ymm5
>      vmulpd  xmm2, xmm3, xmm6
> @@ -148,12 +159,27 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      ADDPD_MEM COVAR(iq  ,1), ymm1
>      ADDPD_MEM COVAR(iq+2,2), xmm2
>      ADDPD_MEM COVAR(iq+2,3), xmm3
> +%endif ; cpuflag(fma3)
>      lea     jd, [iq + 4]
>      cmp     jd, count2d
>      jg .skip4x4
>  .loop4x4:
>      ; Compute all 16 pairwise products of a 4x4 block
>      mova    ymm3, [varq + jq*8]
> +%if cpuflag(fma3)
> +    mova ymm0, COVAR(jq, 0)
> +    mova ymm1, COVAR(jq, 1)
> +    mova ymm2, COVAR(jq, 2)
> +    mova ymm3, COVAR(jq, 3)
> +    vfmadd231pd ymm0, ymm3, ymm4
> +    vfmadd231pd ymm1, ymm3, ymm5
> +    vfmadd231pd ymm2, ymm3, ymm6
> +    vfmadd231pd ymm3, ymm3, ymm7
> +    mova COVAR(jq, 0), ymm0
> +    mova COVAR(jq, 1), ymm1
> +    mova COVAR(jq, 2), ymm2
> +    mova COVAR(jq, 3), ymm3
> +%else
>      vmulpd  ymm0, ymm3, ymm4
>      vmulpd  ymm1, ymm3, ymm5
>      vmulpd  ymm2, ymm3, ymm6
> @@ -162,6 +188,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      ADDPD_MEM COVAR(jq,1), ymm1
>      ADDPD_MEM COVAR(jq,2), ymm2
>      ADDPD_MEM COVAR(jq,3), ymm3
> +%endif ; cpuflag(fma3)
>      add     jd, 4
>      cmp     jd, count2d
>      jle .loop4x4
> @@ -169,6 +196,20 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      cmp     jd, countd
>      jg .skip2x4
>      mova    xmm3, [varq + jq*8]
> +%if cpuflag(fma3)
> +    mova xmm0, COVAR(jq, 0)
> +    mova xmm1, COVAR(jq, 1)
> +    mova xmm2, COVAR(jq, 2)
> +    mova xmm3, COVAR(jq, 3)
> +    vfmadd231pd xmm0, xmm3, xmm4
> +    vfmadd231pd xmm1, xmm3, xmm5
> +    vfmadd231pd xmm2, xmm3, xmm6
> +    vfmadd231pd xmm3, xmm3, xmm7
> +    mova COVAR(jq, 0), xmm0
> +    mova COVAR(jq, 1), xmm1
> +    mova COVAR(jq, 2), xmm2
> +    mova COVAR(jq, 3), xmm3
> +%else
>      vmulpd  xmm0, xmm3, xmm4
>      vmulpd  xmm1, xmm3, xmm5
>      vmulpd  xmm2, xmm3, xmm6
> @@ -177,6 +218,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      ADDPD_MEM COVAR(jq,1), xmm1
>      ADDPD_MEM COVAR(jq,2), xmm2
>      ADDPD_MEM COVAR(jq,3), xmm3
> +%endif ; cpuflag(fma3)
>  .skip2x4:
>      add     id, 4
>      add covarq, 4*COVAR_STRIDE
> @@ -187,14 +229,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>      mov     jd, id
>  .loop2x1:
>      vmovddup xmm0, [varq + iq*8]
> +%if cpuflag(fma3)
> +    mova xmm1, [varq + jq*8]
> +    vfmadd213pd xmm0, xmm1, COVAR(jq,0)
> +    mova COVAR(jq,0), xmm0
> +%else
>      vmulpd   xmm0, [varq + jq*8]
>      ADDPD_MEM COVAR(jq,0), xmm0
> +%endif ; cpuflag(fma3)
>      inc     id
>      add covarq, COVAR_STRIDE
>      cmp     id, countd
>      jle .loop2x1
>  .ret:
>      REP_RET
> +%endmacro ; UPDATE_LLS
> +
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +UPDATE_LLS
> +%endif
> +%if HAVE_FMA3_EXTERNAL
> +INIT_YMM fma3
> +UPDATE_LLS
>  %endif
>
>  INIT_XMM sse2
> diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
> index 81f141c..9f0d862 100644
> --- a/libavutil/x86/lls_init.c
> +++ b/libavutil/x86/lls_init.c
> @@ -25,6 +25,7 @@
>
>  void ff_update_lls_sse2(LLSModel *m, const double *var);
>  void ff_update_lls_avx(LLSModel *m, const double *var);
> +void ff_update_lls_fma3(LLSModel *m, const double *var);
>  double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
>
>  av_cold void ff_init_lls_x86(LLSModel *m)
> @@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
>      if (EXTERNAL_AVX_FAST(cpu_flags)) {
>          m->update_lls = ff_update_lls_avx;
>      }
> +    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
> +        m->update_lls = ff_update_lls_fma3;
> +    }
>  }
> --
> 2.7.0
>

I should mention one thing: rank-one updates of the Cholesky
factorization are likely not that useful, since I examined and found
~4000 update calls for 1 solve call for the fate-flac entry. I want to
add this as a comment to the update_lls function, so that in the future
I, or someone thinking along those lines, is aware that it is better not
to do rank-one updates of the Cholesky factorization. I can add it to
this patch or to a separate one, whichever people prefer.


More information about the ffmpeg-devel mailing list