[FFmpeg-devel] [PATCH] mdct15: simplify the fft15 x86 SIMD
Rostislav Pehlivanov
atomnuker at gmail.com
Tue May 8 01:47:40 EEST 2018
On 6 May 2018 at 23:19, Rostislav Pehlivanov <atomnuker at gmail.com> wrote:
> Saves 1 gpr and 2 instructions and simplifies the macros a bit.
>
> Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
> ---
> libavcodec/x86/mdct15.asm | 37 +++++++++++++++++--------------------
> 1 file changed, 17 insertions(+), 20 deletions(-)
>
> diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
> index 0309112538..2a2cdbd21b 100644
> --- a/libavcodec/x86/mdct15.asm
> +++ b/libavcodec/x86/mdct15.asm
> @@ -76,7 +76,7 @@ SECTION .text
> addps m%3, m%3, m0 ; Finally offset with DCs
> %endmacro
>
> -%macro BUTTERFLIES_DC 2 ; %1 - exptab_offset, %2 - out
> +%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
> mulps xm0, xm9, [exptabq + %1 + 16*0]
> mulps xm1, xm10, [exptabq + %1 + 16*1]
>
> @@ -86,10 +86,10 @@ SECTION .text
> addps xm0, xm1
> addps xm0, xm8
>
> - movsd [%2q], xm0
> + movsd [outq], xm0
> %endmacro
>
> -%macro BUTTERFLIES_AC 2 ; exptab, exptab_offset, src1, src2, src3, out (uses m0-m3)
> +%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
> mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
> mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
> mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
> @@ -104,15 +104,14 @@ SECTION .text
>
> vextractf128 xm1, m0, 1
>
> - movlps [%2q + strideq*1], xm0
> - movhps [%2q + strideq*2], xm0
> - movlps [%2q + stride3q], xm1
> - movhps [%2q + strideq*4], xm1
> + movlps [outq + strideq*1], xm0
> + movhps [outq + strideq*2], xm0
> + movlps [outq + stride3q], xm1
> + movhps [outq + strideq*4], xm1
> %endmacro
>
> INIT_YMM avx
> -cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
> -%define out0q inq
> +cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
> shl strideq, 3
>
> movaps xm5, [exptabq + 480 + 16*0]
> @@ -123,22 +122,20 @@ cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
> FFT5 8, xm9, 12
> FFT5 16, xm10, 13
>
> +%define stride3q inq
> lea stride3q, [strideq + strideq*2]
> lea stride5q, [strideq + strideq*4]
>
> - mov out0q, outq
> + BUTTERFLIES_DC (8*6 + 4*0)*2*4
> + BUTTERFLIES_AC (8*0 + 0*0)*2*4
>
> - BUTTERFLIES_DC (8*6 + 4*0)*2*4, out0
> - lea outq, [out0q + stride5q*1]
> - BUTTERFLIES_DC (8*6 + 4*1)*2*4, out
> - lea outq, [out0q + stride5q*2]
> - BUTTERFLIES_DC (8*6 + 4*2)*2*4, out
> + add outq, stride5q
> + BUTTERFLIES_DC (8*6 + 4*1)*2*4
> + BUTTERFLIES_AC (8*2 + 0*0)*2*4
>
> - BUTTERFLIES_AC (8*0)*2*4, out0
> - lea outq, [out0q + stride5q*1]
> - BUTTERFLIES_AC (8*2)*2*4, out
> - lea outq, [out0q + stride5q*2]
> - BUTTERFLIES_AC (8*4)*2*4, out
> + add outq, stride5q
> + BUTTERFLIES_DC (8*6 + 4*2)*2*4
> + BUTTERFLIES_AC (8*4 + 0*0)*2*4
>
> RET
>
> --
> 2.17.0
>
>
Pushed alongside a patch to simplify the init function.
More information about the ffmpeg-devel
mailing list