[FFmpeg-devel] [PATCH] avfilter/avf_showcqt: cqt_calc optimization on x86

Muhammad Faiz mfcc64 at gmail.com
Wed Jun 8 11:16:32 CEST 2016


On Tue, Jun 7, 2016 at 4:18 PM, Muhammad Faiz <mfcc64 at gmail.com> wrote:
> On Tue, Jun 7, 2016 at 10:36 AM, James Almer <jamrial at gmail.com> wrote:
>> On 6/4/2016 4:36 AM, Muhammad Faiz wrote:
>>> benchmark on x86_64
>>> cqt_time:
>>> plain = 3.292 s
>>> SSE   = 1.640 s
>>> SSE3  = 1.631 s
>>> AVX   = 1.395 s
>>> FMA3  = 1.271 s
>>> FMA4  = not available
>>
>> Try using the START_TIMER and STOP_TIMER macros to wrap the s->cqt_calc
>> call in libavfilter/avf_showcqt.c.
>> It will potentially give more accurate results than the current
>> UPDATE_TIME(s->cqt_time) check.
>>
> OK, but I will probably check it privately (without sending a patch)
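
For reference, a minimal sketch of such a wrapper, using the START_TIMER /
STOP_TIMER macros from libavutil/timer.h; the surrounding context and the
argument names passed to cqt_calc are illustrative only, not the actual call
site in avf_showcqt.c:

    #include "libavutil/timer.h"

    /* somewhere in the frame processing path; dst, src and len are
     * illustrative placeholders for the real call-site arguments */
    {
        START_TIMER
        s->cqt_calc(dst, src, s->coeffs, len, s->fft_len);
        STOP_TIMER("cqt_calc")
    }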
>
>>>
>>> untested on x86_32
>>
>> Do you have a sample command to test this? As Michael said, FATE doesn't
>> cover showcqt.
>>
>
> I check it using the psnr filter (average PSNR above 90 dB means it is OK)
>
> #!/bin/bash
> # example usage: ./psnr-check audio.mp3 yuv420p "-cpuflags -fma3-fma4-avx-sse3"
>
> mkfifo in0.y4m
> mkfifo in1.y4m
>
> # this is new ffmpeg
> build_path=$HOME/Documents/sources/ffmpeg/ffmpeg-build
> LD_LIBRARY_PATH=$build_path/libavcodec:$build_path/libavdevice:$build_path/libavfilter
> LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libavformat:$build_path/libavutil
> LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libpostproc:$build_path/libswresample
> LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libswscale
>
> export LD_LIBRARY_PATH
>
> $build_path/ffmpeg $3 -i "$1" -filter_complex \
>     "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" \
>     -f yuv4mpegpipe -y in0.y4m >/dev/null 2>&1 </dev/null &
>
> # this is old ffmpeg
> unset LD_LIBRARY_PATH
> ffmpeg $3 -i "$1" -filter_complex \
>     "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" \
>     -f yuv4mpegpipe -y in1.y4m >/dev/null 2>&1 </dev/null &
>
> ffmpeg -i in0.y4m -i in1.y4m -filter_complex "psnr=f=-" -f null -y /dev/null
>
>
>>>
>>> Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
>>> ---
>>>  libavfilter/avf_showcqt.c          |   7 ++
>>>  libavfilter/avf_showcqt.h          |   3 +
>>>  libavfilter/x86/Makefile           |   2 +
>>>  libavfilter/x86/avf_showcqt.asm    | 206 +++++++++++++++++++++++++++++++++++++
>>>  libavfilter/x86/avf_showcqt_init.c |  63 ++++++++++++
>>>  5 files changed, 281 insertions(+)
>>>  create mode 100644 libavfilter/x86/avf_showcqt.asm
>>>  create mode 100644 libavfilter/x86/avf_showcqt_init.c
>>>
>>> diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
>>> index b88c83c..62d5b09 100644
>>> --- a/libavfilter/avf_showcqt.c
>>> +++ b/libavfilter/avf_showcqt.c
>>> @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
>>>              w *= sign * (1.0 / s->fft_len);
>>>              s->coeffs[m].val[x - s->coeffs[m].start] = w;
>>>          }
>>> +
>>> +        if (s->permute_coeffs)
>>> +            s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
>>>      }
>>>
>>>      av_expr_free(expr);
>>> @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
>>>
>>>      s->cqt_align = 1;
>>>      s->cqt_calc = cqt_calc;
>>> +    s->permute_coeffs = NULL;
>>>      s->draw_sono = draw_sono;
>>>      if (s->format == AV_PIX_FMT_RGB24) {
>>>          s->draw_bar = draw_bar_rgb;
>>> @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
>>>          s->update_sono = update_sono_yuv;
>>>      }
>>>
>>> +    if (ARCH_X86)
>>> +        ff_showcqt_init_x86(s);
>>> +
>>>      if ((ret = init_cqt(s)) < 0)
>>>          return ret;
>>>
>>> diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
>>> index b945f49..588830f 100644
>>> --- a/libavfilter/avf_showcqt.h
>>> +++ b/libavfilter/avf_showcqt.h
>>> @@ -74,6 +74,7 @@ typedef struct {
>>>      /* callback */
>>>      void                (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
>>>                                      int len, int fft_len);
>>> +    void                (*permute_coeffs)(float *v, int len);
>>>      void                (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
>>>                                      const ColorFloat *c, int bar_h);
>>>      void                (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
>>> @@ -112,4 +113,6 @@ typedef struct {
>>>      int                 axis;
>>>  } ShowCQTContext;
>>>
>>> +void ff_showcqt_init_x86(ShowCQTContext *s);
>>> +
>>>  #endif
>>> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>>> index 4486b79..b6195f8 100644
>>> --- a/libavfilter/x86/Makefile
>>> +++ b/libavfilter/x86/Makefile
>>> @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
>>>  OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
>>>  OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
>>>  OBJS-$(CONFIG_REMOVEGRAIN_FILTER)            += x86/vf_removegrain_init.o
>>> +OBJS-$(CONFIG_SHOWCQT_FILTER)                += x86/avf_showcqt_init.o
>>>  OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
>>>  OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
>>>  OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
>>> @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
>>>  ifdef CONFIG_GPL
>>>  YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER)       += x86/vf_removegrain.o
>>>  endif
>>> +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER)           += x86/avf_showcqt.o
>>>  YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o
>>>  YASM-OBJS-$(CONFIG_STEREO3D_FILTER)          += x86/vf_stereo3d.o
>>>  YASM-OBJS-$(CONFIG_TBLEND_FILTER)            += x86/vf_blend.o
>>> diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
>>> new file mode 100644
>>> index 0000000..ba30786
>>> --- /dev/null
>>> +++ b/libavfilter/x86/avf_showcqt.asm
>>> @@ -0,0 +1,206 @@
>>> +;*****************************************************************************
>>> +;* x86-optimized functions for showcqt filter
>>> +;*
>>> +;* Copyright (C) 2016 Muhammad Faiz <mfcc64 at gmail.com>
>>> +;*
>>> +;* This file is part of FFmpeg.
>>> +;*
>>> +;* FFmpeg is free software; you can redistribute it and/or
>>> +;* modify it under the terms of the GNU Lesser General Public
>>> +;* License as published by the Free Software Foundation; either
>>> +;* version 2.1 of the License, or (at your option) any later version.
>>> +;*
>>> +;* FFmpeg is distributed in the hope that it will be useful,
>>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +;* Lesser General Public License for more details.
>>> +;*
>>> +;* You should have received a copy of the GNU Lesser General Public
>>> +;* License along with FFmpeg; if not, write to the Free Software
>>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> +;******************************************************************************
>>> +
>>> +%include "libavutil/x86/x86util.asm"
>>> +
>>> +%if ARCH_X86_64
>>> +%define pointer resq
>>> +%else
>>> +%define pointer resd
>>> +%endif
>>> +
>>> +struc Coeffs
>>> +    .val:   pointer 1
>>> +    .start: resd 1
>>> +    .len:   resd 1
>>> +    .sizeof:
>>> +endstruc
>>> +
>>> +%macro EMULATE_HADDPS 3 ; dst, src, tmp
>>> +%if cpuflag(sse3)
>>> +    haddps  %1, %2
>>> +%else
>>> +    movaps  %3, %1
>>> +    shufps  %1, %2, q2020
>>> +    shufps  %3, %2, q3131
>>> +    addps   %1, %3
>>
>> This is great. Much better and more efficient than other attempts to
>> emulate haddps scattered across the codebase.
>> It also makes me wonder if haddps, a ~5 cycles latency instruction is
>> really better than the combination of a mostly free mov, two 1 cycle
>> shuffles and one 3 cycle add to justify extra functions with it as the
>> only difference, at least in cases where there are no register
>> constraints.
>>
>> Your benchmarks above suggest it is, although barely, so I'm curious
>> about what the timer.h macros will show.
>>
> haddps may even be slower than shufps+addps on some machines,
> but since haddps is available, I use it.
> Updated to use SSE3_FAST.
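
For clarity, a scalar C sketch of what both paths of EMULATE_HADDPS compute,
assuming standard SSE3 haddps semantics (the function name and layout are
illustrative only):

    /* Both the haddps path and the shufps+addps fallback produce:
     *   dst = { a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] }
     * where a is the in/out dst register and b is the src register. */
    static void emulate_haddps(float dst[4], const float a[4], const float b[4])
    {
        const float lo[4] = { a[0], a[2], b[0], b[2] };  /* shufps ..., q2020 */
        const float hi[4] = { a[1], a[3], b[1], b[3] };  /* shufps ..., q3131 */
        for (int i = 0; i < 4; i++)
            dst[i] = lo[i] + hi[i];                      /* addps */
    }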
>
>>> +%endif
>>> +%endmacro ; EMULATE_HADDPS
>>> +
>>> +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>> +    fmaddps %1, %2, %3, %4
>>> +%else
>>> +    mulps   %5, %2, %3
>>> +    addps   %1, %4, %5
>>> +%endif
>>> +%endmacro ; EMULATE_FMADDPS
>>> +
>>> +%macro CQT_CALC 9
>>> +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
>>> +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
>>> +    mov     id, xd
>>> +    add     id, [coeffsq + Coeffs.start + %9]
>>> +    movaps  m%5, [srcq + 8 * iq]
>>> +    movaps  m%7, [srcq + 8 * iq + mmsize]
>>> +    shufps  m%6, m%5, m%7, q3131
>>> +    shufps  m%5, m%5, m%7, q2020
>>> +    sub     id, fft_lend
>>> +    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
>>> +    neg     id
>>
>> Is this supposed to turn a positive value negative? If so then it should
>> be "neg iq", otherwise on x86_64 the high 32 bits of iq used in the
>> effective addresses below would be zero.
>>
> This is the intended behavior: (i = fft_len - i), evaluated as int, not int64_t
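
A scalar C sketch of that index computation (illustrative names only): the
arithmetic is deliberately done in 32 bits, the result is non-negative, and
writing the 32-bit register half zero-extends it, so the 64-bit addressing
that follows is safe:

    /* corresponds to: mov id, xd; add id, [coeffsq + Coeffs.start + ...];
     *                 ...; sub id, fft_lend; neg id */
    static inline int mirrored_index(int x, int start, int fft_len)
    {
        int i = x + start;  /* index used by the forward loads from srcq */
        i -= fft_len;
        return -i;          /* fft_len - (x + start), used by the mirrored loads */
    }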
>
>>> +    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
>>> +    movups  m%5, [srcq + 8 * iq - mmsize + 8]
>>> +    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
>>> +    %if mmsize == 32
>>> +    vperm2f128 m%5, m%5, m%5, 1
>>> +    vperm2f128 m%7, m%7, m%7, 1
>>> +    %endif
>>> +    shufps  m%6, m%5, m%7, q1313
>>> +    shufps  m%5, m%5, m%7, q0202
>>> +    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
>>> +    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
>>> +%endmacro ; CQT_CALC
>>> +
>>> +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
>>> +    addps   m%5, m%4, m%2
>>> +    subps   m%6, m%3, m%1
>>> +    addps   m%1, m%3
>>> +    subps   m%2, m%4
>>> +    EMULATE_HADDPS m%5, m%6, m%3
>>> +    EMULATE_HADDPS m%1, m%2, m%3
>>> +    EMULATE_HADDPS m%1, m%5, m%2
>>> +    %if mmsize == 32
>>> +    vextractf128 xmm%2, m%1, 1
>>> +    addps   xmm%1, xmm%2
>>> +    %endif
>>> +%endmacro ; CQT_SEPARATE
>>> +
>>> +%macro DECLARE_CQT_CALC 0
>>> +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
>>> +%if ARCH_X86_64
>>> +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
>>> +    align   16
>>> +    .loop_k:
>>> +        mov     xd, [coeffsq + Coeffs.len]
>>> +        xorps   m0, m0
>>> +        movaps  m1, m0
>>> +        movaps  m2, m0
>>> +        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
>>> +        movaps  m3, m0
>>> +        movaps  m8, m0
>>> +        cmp     coeffs_lend, xd
>>> +        movaps  m9, m0
>>> +        movaps  m10, m0
>>> +        movaps  m11, m0
>>> +        cmova   coeffs_lend, xd
>>> +        xor     xd, xd
>>> +        test    coeffs_lend, coeffs_lend
>>> +        jz      .check_loop_b
>>> +        mov     coeffs_valq, [coeffsq + Coeffs.val]
>>> +        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
>>> +        align   16
>>> +        .loop_ab:
>>> +            movaps  m7, [coeffs_valq + 4 * xq]
>>> +            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>>> +            movaps  m7, [coeffs_val2q + 4 * xq]
>>> +            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, coeffs_lend
>>> +            jb      .loop_ab
>>> +        .check_loop_b:
>>> +        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
>>> +        jae     .check_loop_a
>>> +        align   16
>>> +        .loop_b:
>>> +            movaps  m7, [coeffs_val2q + 4 * xq]
>>> +            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
>>> +            jb      .loop_b
>>> +        .loop_end:
>>> +        CQT_SEPARATE 0, 1, 2, 3, 4, 5
>>> +        CQT_SEPARATE 8, 9, 10, 11, 4, 5
>>> +        mulps   xmm0, xmm0
>>> +        mulps   xmm8, xmm8
>>> +        EMULATE_HADDPS xmm0, xmm8, xmm1
>>> +        movaps  [dstq], xmm0
>>> +        sub     lend, 2
>>> +        lea     dstq, [dstq + 16]
>>
>> Use add
>>
>>> +        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
>>
>> Same, assuming sizeof is an immediate.
>>
> This is an optimization: the leas deliberately separate the sub from the jnz.
> Using add instead would clobber the flags register set by sub.
> Also, lea does not need a REX prefix.
>
>>> +        jnz     .loop_k
>>> +        REP_RET
>>> +        align   16
>>> +        .check_loop_a:
>>> +        cmp     xd, [coeffsq + Coeffs.len]
>>> +        jae     .loop_end
>>> +        align   16
>>> +        .loop_a:
>>> +            movaps  m7, [coeffs_valq + 4 * xq]
>>> +            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, [coeffsq + Coeffs.len]
>>> +            jb      .loop_a
>>> +        jmp     .loop_end
>>> +%else
>>> +cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
>>> +%define fft_lend r4m
>>> +    align   16
>>> +    .loop_k:
>>> +        mov     xd, [coeffsq + Coeffs.len]
>>> +        xorps   m0, m0
>>> +        movaps  m1, m0
>>> +        movaps  m2, m0
>>> +        movaps  m3, m0
>>> +        test    xd, xd
>>> +        jz      .store
>>> +        mov     coeffs_valq, [coeffsq + Coeffs.val]
>>> +        xor     xd, xd
>>> +        align   16
>>> +        .loop_x:
>>> +            movaps  m7, [coeffs_valq + 4 * xq]
>>> +            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, [coeffsq + Coeffs.len]
>>> +            jb      .loop_x
>>> +        CQT_SEPARATE 0, 1, 2, 3, 4, 5
>>> +        mulps   xmm0, xmm0
>>> +        EMULATE_HADDPS xmm0, xmm0, xmm1
>>> +        .store:
>>> +        movlps  [dstq], xmm0
>>> +        sub     lend, 1
>>> +        lea     dstq, [dstq + 8]
>>> +        lea     coeffsq, [coeffsq + Coeffs.sizeof]
>>
>> Same as above for both of these leas.
>>
> Same answer.
>
>>> +        jnz     .loop_k
>>> +        REP_RET
>>> +%endif ; ARCH_X86_64
>>> +%endmacro ; DECLARE_CQT_CALC
>>> +
>>> +INIT_XMM sse
>>> +DECLARE_CQT_CALC
>>> +INIT_XMM sse3
>>> +DECLARE_CQT_CALC
>>> +INIT_YMM avx
>>> +DECLARE_CQT_CALC
>>> +INIT_YMM fma3
>>> +DECLARE_CQT_CALC
>>> +INIT_YMM fma4
>>
>> All CPUs supporting FMA4 underperform in functions using ymm registers.
>> Make it xmm instead.
>>
> OK, updated
>
>>> +DECLARE_CQT_CALC
>>> diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c
>>> new file mode 100644
>>> index 0000000..664c6ac
>>> --- /dev/null
>>> +++ b/libavfilter/x86/avf_showcqt_init.c
>>> @@ -0,0 +1,63 @@
>>> +/*
>>> + * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/attributes.h"
>>> +#include "libavutil/cpu.h"
>>> +#include "libavutil/x86/cpu.h"
>>> +#include "libavfilter/avf_showcqt.h"
>>> +
>>> +#define DECLARE_CQT_CALC(type) \
>>> +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
>>> +                                const Coeffs *coeffs, int len, int fft_len)
>>> +
>>> +DECLARE_CQT_CALC(sse);
>>> +DECLARE_CQT_CALC(sse3);
>>> +DECLARE_CQT_CALC(avx);
>>> +DECLARE_CQT_CALC(fma3);
>>> +DECLARE_CQT_CALC(fma4);
>>> +
>>> +#define permute_coeffs_0 NULL
>>> +
>>> +static void permute_coeffs_01452367(float *v, int len)
>>> +{
>>> +    int k;
>>> +    for (k = 0; k < len; k += 8) {
>>> +        FFSWAP(float, v[k+2], v[k+4]);
>>> +        FFSWAP(float, v[k+3], v[k+5]);
>>> +    }
>>> +}
>>> +
>>> +av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
>>> +{
>>> +    int cpuflags = av_get_cpu_flags();
>>> +
>>> +#define SELECT_CQT_CALC(type, TYPE, align, perm) \
>>> +if (EXTERNAL_##TYPE(cpuflags)) { \
>>> +    s->cqt_calc = ff_showcqt_cqt_calc_##type; \
>>> +    s->cqt_align = align; \
>>> +    s->permute_coeffs = permute_coeffs_##perm; \
>>> +}
>>> +
>>> +    SELECT_CQT_CALC(sse,  SSE,  4, 0);
>>> +    SELECT_CQT_CALC(sse3, SSE3, 4, 0);
>>> +    SELECT_CQT_CALC(avx,  AVX,  8, 01452367);
>>
>> Use AVX_FAST, so this function will not be used on CPUs that set the
>> AV_CPU_FLAG_AVXSLOW flag.
>>
>>> +    SELECT_CQT_CALC(fma3, FMA3, 8, 01452367);
>>
>> Same, use FMA3_FAST. The result will then be the FMA3 version used by
>> Intel CPUs and hopefully AMD Zen, and the FMA4 one by Bulldozer based
>> CPUs.
>>
>>> +    SELECT_CQT_CALC(fma4, FMA4, 8, 01452367);
>>> +}
>>>
>>
>
> OK, I also reordered them (FMA4 before AVX, because AVX/ymm without FMA4 is
> faster than FMA4/xmm)
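
For readers following the thread, a sketch of the selection order this implies
(later matches override earlier ones); the actual modified patch is attached to
the original mail and may differ: the *_FAST macros follow the suggestions
above, and the align/permute values for the xmm FMA4 version are inferred here,
not taken from the patch:

    SELECT_CQT_CALC(sse,  SSE,       4, 0);
    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
    SELECT_CQT_CALC(fma4, FMA4,      4, 0);        /* xmm version */
    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);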
>
> Modified patch attached
>
> Thanks.

Applied with a modified commit message.

Thanks.

