[FFmpeg-devel] [PATCH v2] mdct15: add assembly optimizations for the 15-point FFT
James Almer
jamrial at gmail.com
Fri Jun 23 01:05:22 EEST 2017
On 6/22/2017 5:40 PM, Rostislav Pehlivanov wrote:
> c: 1802 decicycles in fft15,16774635 runs, 2581 skips
> fma3: 935 decicycles in fft15,16775893 runs, 1323 skips
> ---
> libavcodec/mdct15.c | 186 ++++++++++++++++++++++---------------------
> libavcodec/mdct15.h | 26 +++---
> libavcodec/x86/Makefile | 2 +
> libavcodec/x86/mdct15.asm | 148 ++++++++++++++++++++++++++++++++++
> libavcodec/x86/mdct15_init.c | 91 +++++++++++++++++++++
> 5 files changed, 344 insertions(+), 109 deletions(-)
> create mode 100644 libavcodec/x86/mdct15.asm
> create mode 100644 libavcodec/x86/mdct15_init.c
>
> diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
> index 8c42ece483..6eb428c250 100644
> --- a/libavcodec/mdct15.c
> +++ b/libavcodec/mdct15.c
> @@ -57,11 +57,6 @@ av_cold void ff_mdct15_uninit(MDCT15Context **ps)
> av_freep(ps);
> }
>
> -static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
> -
> -static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> - ptrdiff_t stride, float scale);
> -
> static inline int init_pfa_reindex_tabs(MDCT15Context *s)
> {
> int i, j;
> @@ -93,88 +88,8 @@ static inline int init_pfa_reindex_tabs(MDCT15Context *s)
> return 0;
> }
>
> -av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> -{
> - MDCT15Context *s;
> - double alpha, theta;
> - int len2 = 15 * (1 << N);
> - int len = 2 * len2;
> - int i;
> -
> - /* Tested and verified to work on everything in between */
> - if ((N < 2) || (N > 13))
> - return AVERROR(EINVAL);
> -
> - s = av_mallocz(sizeof(*s));
> - if (!s)
> - return AVERROR(ENOMEM);
> -
> - s->fft_n = N - 1;
> - s->len4 = len2 / 2;
> - s->len2 = len2;
> - s->inverse = inverse;
> - s->mdct = mdct15;
> - s->imdct_half = imdct15_half;
> -
> - if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> - goto fail;
> -
> - if (init_pfa_reindex_tabs(s))
> - goto fail;
> -
> - s->tmp = av_malloc_array(len, 2 * sizeof(*s->tmp));
> - if (!s->tmp)
> - goto fail;
> -
> - s->twiddle_exptab = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
> - if (!s->twiddle_exptab)
> - goto fail;
> -
> - theta = 0.125f + (scale < 0 ? s->len4 : 0);
> - scale = sqrt(fabs(scale));
> - for (i = 0; i < s->len4; i++) {
> - alpha = 2 * M_PI * (i + theta) / len;
> - s->twiddle_exptab[i].re = cos(alpha) * scale;
> - s->twiddle_exptab[i].im = sin(alpha) * scale;
> - }
> -
> - /* 15-point FFT exptab */
> - for (i = 0; i < 19; i++) {
> - if (i < 15) {
> - double theta = (2.0f * M_PI * i) / 15.0f;
> - if (!s->inverse)
> - theta *= -1;
> - s->exptab[i].re = cos(theta);
> - s->exptab[i].im = sin(theta);
> - } else { /* Wrap around to simplify fft15 */
> - s->exptab[i] = s->exptab[i - 15];
> - }
> - }
> -
> - /* 5-point FFT exptab */
> - s->exptab[19].re = cos(2.0f * M_PI / 5.0f);
> - s->exptab[19].im = sin(2.0f * M_PI / 5.0f);
> - s->exptab[20].re = cos(1.0f * M_PI / 5.0f);
> - s->exptab[20].im = sin(1.0f * M_PI / 5.0f);
> -
> - /* Invert the phase for an inverse transform, do nothing for a forward transform */
> - if (s->inverse) {
> - s->exptab[19].im *= -1;
> - s->exptab[20].im *= -1;
> - }
> -
> - *ps = s;
> -
> - return 0;
> -
> -fail:
> - ff_mdct15_uninit(&s);
> - return AVERROR(ENOMEM);
> -}
> -
> /* Stride is hardcoded to 3 */
> -static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
> - const FFTComplex *in)
> +static inline void fft5(FFTComplex *out, FFTComplex *in, FFTComplex exptab[2])
> {
> FFTComplex z0[4], t[6];
>
> @@ -219,14 +134,14 @@ static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
> out[4].im = in[0].im + z0[3].im;
> }
>
> -static void fft15(const FFTComplex exptab[22], FFTComplex *out, const FFTComplex *in, size_t stride)
> +static void fft15_c(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride)
> {
> int k;
> FFTComplex tmp1[5], tmp2[5], tmp3[5];
>
> - fft5(exptab + 19, tmp1, in + 0);
> - fft5(exptab + 19, tmp2, in + 1);
> - fft5(exptab + 19, tmp3, in + 2);
> + fft5(tmp1, in + 0, exptab + 19);
> + fft5(tmp2, in + 1, exptab + 19);
> + fft5(tmp3, in + 2, exptab + 19);
>
> for (k = 0; k < 5; k++) {
> FFTComplex t[2];
> @@ -269,7 +184,7 @@ static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
> }
> CMUL(fft15in[j].re, fft15in[j].im, re, im, s->twiddle_exptab[k].re, -s->twiddle_exptab[k].im);
> }
> - fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
> + s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
> }
>
> /* Then a 15xN FFT (where N is a power of two) */
> @@ -291,6 +206,8 @@ static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
> }
> }
>
> +#include "libavutil/timer.h"
Remove this.
> +
> static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> ptrdiff_t stride, float scale)
> {
> @@ -306,7 +223,9 @@ static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> FFTComplex tmp = { *(in2 - 2*k*stride), *(in1 + 2*k*stride) };
> CMUL3(fft15in[j], tmp, s->twiddle_exptab[k]);
> }
> - fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
> + START_TIMER("fft15");
> + s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
> + STOP_TIMER("fft15");
Also this, of course.
> }
>
> /* Then a 15xN FFT (where N is a power of two) */
> @@ -327,3 +246,86 @@ static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> z[i0].im = scale * im1;
> }
> }
> +
> +av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> +{
> + MDCT15Context *s;
> + double alpha, theta;
> + int len2 = 15 * (1 << N);
> + int len = 2 * len2;
> + int i;
> +
> + /* Tested and verified to work on everything in between */
> + if ((N < 2) || (N > 13))
> + return AVERROR(EINVAL);
> +
> + s = av_mallocz(sizeof(*s));
> + if (!s)
> + return AVERROR(ENOMEM);
> +
> + s->fft_n = N - 1;
> + s->len4 = len2 / 2;
> + s->len2 = len2;
> + s->inverse = inverse;
> + s->fft15 = fft15_c;
> + s->mdct = mdct15;
> + s->imdct_half = imdct15_half;
> +
> + if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> + goto fail;
> +
> + if (init_pfa_reindex_tabs(s))
> + goto fail;
> +
> + s->tmp = av_malloc_array(len, 2 * sizeof(*s->tmp));
> + if (!s->tmp)
> + goto fail;
> +
> + s->twiddle_exptab = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
> + if (!s->twiddle_exptab)
> + goto fail;
> +
> + theta = 0.125f + (scale < 0 ? s->len4 : 0);
> + scale = sqrt(fabs(scale));
> + for (i = 0; i < s->len4; i++) {
> + alpha = 2 * M_PI * (i + theta) / len;
> + s->twiddle_exptab[i].re = cosf(alpha) * scale;
> + s->twiddle_exptab[i].im = sinf(alpha) * scale;
> + }
> +
> + /* 15-point FFT exptab */
> + for (i = 0; i < 19; i++) {
> + if (i < 15) {
> + double theta = (2.0f * M_PI * i) / 15.0f;
> + if (!s->inverse)
> + theta *= -1;
> + s->exptab[i].re = cosf(theta);
> + s->exptab[i].im = sinf(theta);
> + } else { /* Wrap around to simplify fft15 */
> + s->exptab[i] = s->exptab[i - 15];
> + }
> + }
> +
> + /* 5-point FFT exptab */
> + s->exptab[19].re = cosf(2.0f * M_PI / 5.0f);
> + s->exptab[19].im = sinf(2.0f * M_PI / 5.0f);
> + s->exptab[20].re = cosf(1.0f * M_PI / 5.0f);
> + s->exptab[20].im = sinf(1.0f * M_PI / 5.0f);
> +
> + /* Invert the phase for an inverse transform, do nothing for a forward transform */
> + if (s->inverse) {
> + s->exptab[19].im *= -1;
> + s->exptab[20].im *= -1;
> + }
> +
> + if (ARCH_X86)
> + ff_mdct15_init_x86(s);
> +
> + *ps = s;
> +
> + return 0;
> +
> +fail:
> + ff_mdct15_uninit(&s);
> + return AVERROR(ENOMEM);
> +}
> diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
> index ef94edff6c..7d83f3ebdf 100644
> --- a/libavcodec/mdct15.h
> +++ b/libavcodec/mdct15.h
> @@ -34,34 +34,26 @@ typedef struct MDCT15Context {
> int *pfa_postreindex;
>
> FFTContext ptwo_fft;
> -
> FFTComplex *tmp;
> -
> FFTComplex *twiddle_exptab;
>
> - /* 0 - 18: fft15 twiddles, 19 - 20: fft5 twiddles */
> - FFTComplex exptab[21];
> + DECLARE_ALIGNED(32, FFTComplex, exptab)[64];
>
> - /**
> - * Calculate a full 2N -> N MDCT
> - */
> + /* 15-point FFT */
> + void (*fft15)(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
> +
> + /* Calculate a full 2N -> N MDCT */
> void (*mdct)(struct MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
>
> - /**
> - * Calculate the middle half of the iMDCT
> - */
> + /* Calculate the middle half of the iMDCT */
> void (*imdct_half)(struct MDCT15Context *s, float *dst, const float *src,
> ptrdiff_t src_stride, float scale);
> } MDCT15Context;
>
> -/**
> - * Init an (i)MDCT of the length 2 * 15 * (2^N)
> - */
> +/* Init an (i)MDCT of the length 2 * 15 * (2^N) */
> int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
> -
> -/**
> - * Frees a context
> - */
> void ff_mdct15_uninit(MDCT15Context **ps);
>
> +void ff_mdct15_init_x86(MDCT15Context *s);
> +
> #endif /* AVCODEC_MDCT15_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index e65118d134..7cfcc8d96a 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -25,6 +25,7 @@ OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
> OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
> OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
> OBJS-$(CONFIG_LPC) += x86/lpc.o
> +OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
> OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
> OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
> OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
> @@ -158,6 +159,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
> x86/hevc_sao.o \
> x86/hevc_sao_10bit.o
> X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
> +X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
Again, move it to the subsystem section.
> X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
> X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
> X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
> diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
> new file mode 100644
> index 0000000000..ad39365b5b
> --- /dev/null
> +++ b/libavcodec/x86/mdct15.asm
> @@ -0,0 +1,148 @@
> +;******************************************************************************
> +;* SIMD optimized non-power-of-two MDCT functions
> +;*
> +;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32
No need to make it 32-byte aligned anymore.
> +
> +sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
> +
> +SECTION .text
> +
> +; %1 %2 %3 %4 %5 %6 %7
> +%macro FFT5 7 ; src, offset, dst1 (64bit used), dst2, exptab1, exptab2, signadjust (uses m0-m4)
> + movddup xm0, [%1q + 0*16 + 0 + %2] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
> + movsd xm1, [%1q + 1*16 + 8 + %2] ; in[ 3].re, in[ 3].im, 0, 0
> + movups xm2, [%1q + 2*16 + 8 + %2] ; in[ 5].re, in[ 5].im, in[ 6].re, in[ 6].im
> + movups xm3, [%1q + 4*16 + 0 + %2] ; in[ 8].re, in[ 8].im, in[ 9].re, in[ 9].im
Make these two movsd (adding eight bytes to the offset)...
> + movsd xm4, [%1q + 6*16 + 0 + %2] ; in[12].re, in[12].im, 0, 0
> +
> + vinsertf128 m0, xm0, 1
> +
> + shufps xm1, xm2, q3210 ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
> + shufps xm4, xm3, q3210 ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
...and these q1010. It makes the function about 20 decicycles faster for me.
> +
> + subps xm2, xm1, xm4 ; t[2].im, t[2].re, t[3].im, t[3].re
> + addps xm1, xm4 ; t[0].re, t[0].im, t[1].re, t[1].im
> +
> + movhlps %3, xm1 ; t[0].re, t[1].re, t[0].im, t[1].im
> + addps %3, xm1
> + addps %3, xm0 ; DC[0].re, DC[0].im, junk...
> + movlhps %3, %3 ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
> +
> + shufps xm3, xm1, xm2, q0110 ; t[0].re, t[0].im, t[2].re, t[2].im
> + shufps xm1, xm2, q2332 ; t[1].re, t[1].im, t[3].re, t[3].im
> +
> + mulps xm%4, xm1, %5
> + mulps xm4, xm3, %6
> + mulps xm1, %6
> +
> + xorps xm1, %7
> + fmaddsubps xm3, xm3, %5, xm1 ; t[0].re, t[0].im, t[2].re, t[2].im
This is the only remaining FMA instruction. At this point I assume it
alone is not going to make the function measurably faster than just
using mulps + addsubps.
IMO, just replace it and make the function AVX. That way more CPUs will
be able to run it (Sandy Bridge/Bulldozer onward, instead of
Haswell/Piledriver).
> + subps xm%4, xm4 ; t[4].re, t[4].im, t[5].re, t[5].im
> +
> + movhlps xm2, xm%4, xm3 ; t[2].re, t[2].im, t[5].re, t[5].im
> + movlhps xm3, xm%4 ; t[0].re, t[0].im, t[4].re, t[4].im
> +
> + xorps xm2, %7
> + addps xm%4, xm2, xm3
> + subps xm3, xm2
> +
> + shufps xm3, xm3, q1032
> + vinsertf128 m%4, xm3, 1 ; All ACs (tmp[1] through to tmp[4])
> + addps m%4, m0 ; Finally offset with DCs
> +%endmacro
> +
> +; %1 %2 %3 %4 %5 %6
> +%macro BUTTERFLIES_DC 6 ; exptab, exptab_offset, src1, src2, src3, out (uses m0-m1)
> + movaps m0, [%1q + %2]
> + vextractf128 xm1, m0, 1
> +
> + mulps xm1, %5
> + mulps xm0, %4
> +
> + haddps xm0, xm1
> + movhlps xm1, xm0 ; t[0].re, t[1].re, t[0].im, t[1].im
> +
> + addps xm0, xm1
> + addps xm0, %3
> +
> + movsd [%6q], xm0
> +%endmacro
> +
> +; %1 %2 %3 %4 %5 %6
> +%macro BUTTERFLIES_AC 6 ; exptab, exptab_offset, src1, src2, src3, out (uses m0-m3)
> + mulps m0, %4, [%1q + 64*0 + 0*mmsize + %2]
> + mulps m1, %4, [%1q + 64*0 + 1*mmsize + %2]
> + mulps m2, %5, [%1q + 64*1 + 0*mmsize + %2]
> + mulps m3, %5, [%1q + 64*1 + 1*mmsize + %2]
> +
> + shufps m1, m1, q2301
> + shufps m3, m3, q2301
> +
> + addps m0, m1
> + addps m2, m3
> + addps m0, m2
> + addps m0, %3
> +
> + vextractf128 xm1, m0, 1
> +
> + movlps [%6q + strideq*1], xm0
> + movhps [%6q + strideq*2], xm0
> + movlps [%6q + stride3q], xm1
> + movhps [%6q + strideq*4], xm1
> +%endmacro
> +
> +;******************************************************************************************
> +;void ff_fft15_fma3(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
> +;******************************************************************************************
> +INIT_YMM fma3
> +cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
> +%define out0q inq
> + shl strideq, 3
> +
> + movaps m5, [exptabq + 480]
> + vextractf128 xm6, m5, 1
> + movaps xm7, [sign_adjust_5]
> +
> + FFT5 in, 0, xm8, 11, xm5, xm6, xm7
> + FFT5 in, 8, xm9, 12, xm5, xm6, xm7
> + FFT5 in, 16, xm10, 13, xm5, xm6, xm7
The first and last three arguments never change, so why not just
hardcode them in the macro?
> +
> + lea stride3q, [strideq + strideq*2]
> + lea stride5q, [stride3q + strideq*2]
> +
> + mov out0q, outq
> +
> + BUTTERFLIES_DC exptab, (8*6 + 4*0)*2*4, xm8, xm9, xm10, out0
> + lea outq, [out0q + stride5q*1]
> + BUTTERFLIES_DC exptab, (8*6 + 4*1)*2*4, xm8, xm9, xm10, out
> + lea outq, [out0q + stride5q*2]
> + BUTTERFLIES_DC exptab, (8*6 + 4*2)*2*4, xm8, xm9, xm10, out
> +
> + BUTTERFLIES_AC exptab, (8*0)*2*4, m11, m12, m13, out0
> + lea outq, [out0q + stride5q*1]
> + BUTTERFLIES_AC exptab, (8*2)*2*4, m11, m12, m13, out
> + lea outq, [out0q + stride5q*2]
> + BUTTERFLIES_AC exptab, (8*4)*2*4, m11, m12, m13, out
Same here.
> +
> + RET
> diff --git a/libavcodec/x86/mdct15_init.c b/libavcodec/x86/mdct15_init.c
> new file mode 100644
> index 0000000000..39106d0047
> --- /dev/null
> +++ b/libavcodec/x86/mdct15_init.c
> @@ -0,0 +1,91 @@
> +/*
> + * SIMD optimized non-power-of-two MDCT functions
> + *
> + * Copyright (C) 2017 Rostislav Pehlivanov <atomnuker at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/mdct15.h"
> +
> +void ff_fft15_fma3(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
> +
> +static void perm_twiddles(MDCT15Context *s)
> +{
> + int k;
> +
> + FFTComplex tmp[21], tmp2[30];
> + memcpy(tmp, s->exptab, sizeof(FFTComplex)*21);
> +
> + /* 15-point FFT twiddles */
> + for (k = 0; k < 5; k++) {
> + tmp2[6*k + 0] = tmp[k + 0];
> + tmp2[6*k + 2] = tmp[k + 5];
> + tmp2[6*k + 4] = tmp[k + 10];
> +
> + tmp2[6*k + 1] = tmp[2 * (k + 0)];
> + tmp2[6*k + 3] = tmp[2 * (k + 5)];
> + tmp2[6*k + 5] = tmp[2 * k + 5 ];
> + }
> +
> + for (k = 0; k < 6; k++) {
> + FFTComplex ac_exp[] = {
> + { tmp2[6*1 + k].re, tmp2[6*1 + k].re },
> + { tmp2[6*2 + k].re, tmp2[6*2 + k].re },
> + { tmp2[6*3 + k].re, tmp2[6*3 + k].re },
> + { tmp2[6*4 + k].re, tmp2[6*4 + k].re },
> + { tmp2[6*1 + k].im, -tmp2[6*1 + k].im },
> + { tmp2[6*2 + k].im, -tmp2[6*2 + k].im },
> + { tmp2[6*3 + k].im, -tmp2[6*3 + k].im },
> + { tmp2[6*4 + k].im, -tmp2[6*4 + k].im },
> + };
> + memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
> + }
> +
> + /* Specialcase when k = 0 */
> + for (k = 0; k < 3; k++) {
> + FFTComplex dc_exp[] = {
> + { tmp2[2*k + 0].re, -tmp2[2*k + 0].im },
> + { tmp2[2*k + 0].im, tmp2[2*k + 0].re },
> + { tmp2[2*k + 1].re, -tmp2[2*k + 1].im },
> + { tmp2[2*k + 1].im, tmp2[2*k + 1].re },
> + };
> + memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
> + }
> +
> + /* 5-point FFT twiddles */
> + FFTComplex exp_5point[] = {
/ffmpeg/src/libavcodec/x86/mdct15_init.c:74:5: warning: ISO C90 forbids
mixed declarations and code [-Wdeclaration-after-statement]
FFTComplex exp_5point[] = {
^~~~~~~~~~
> + { tmp[19].re, tmp[19].re },
> + { tmp[19].im, tmp[19].im },
> + { tmp[20].re, tmp[20].re },
> + { tmp[20].im, tmp[20].im },
> + };
> + memcpy(s->exptab + 8*6 + 4*3, exp_5point, 4*sizeof(FFTComplex));
> +}
> +
> +av_cold void ff_mdct15_init_x86(MDCT15Context *s)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (ARCH_X86_64 && EXTERNAL_FMA3(cpu_flags)) {
> + perm_twiddles(s);
perm_twiddles() should be called only once. If more versions of fft15 are
added, you'll have to find a way to ensure that.
> + s->fft15 = ff_fft15_fma3;
> + }
> +}
>
More information about the ffmpeg-devel
mailing list