[FFmpeg-devel] [PATCH 3/4] mips: Optimization of fixed-point FFT
Nedeljko Babic
nbabic at mips.com
Tue Sep 25 16:10:56 CEST 2012
Signed-off-by: Nedeljko Babic <nbabic at mips.com>
---
libavcodec/fft.c | 1 +
libavcodec/fft.h | 1 +
libavcodec/fft_ac3_fixed.c | 4 +-
libavcodec/mips/Makefile | 1 +
libavcodec/mips/fft_mips_fixed.c | 562 ++++++++++++++++++++++++++++++++++++++
5 files changed, 568 insertions(+), 1 deletions(-)
create mode 100644 libavcodec/mips/fft_mips_fixed.c
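
Note for reviewers: every DSPr2 rotation block in the new file reduces to the
same Q15 complex multiply. Below is a minimal scalar C sketch of that kernel,
illustrative only — cmul_q15 is not part of the patch, and the saturation
performed by the fractional DSP multiplies is omitted:

    #include <stdint.h>

    /* What mulsaq_s.w.ph / dpaqx_s.w.ph followed by "extr_r.w <t>, <ac>, 16"
     * compute: a rounded Q15 complex multiply. The fractional instructions
     * double each Q30 product to Q31, so extracting at bit 16 with rounding
     * is equivalent to (sum + (1 << 14)) >> 15. */
    static inline void cmul_q15(int16_t *dre, int16_t *dim,
                                int16_t are, int16_t aim,
                                int16_t bre, int16_t bim)
    {
        int32_t re = (int32_t)are * bre - (int32_t)aim * bim;
        int32_t im = (int32_t)are * bim + (int32_t)aim * bre;

        *dre = (re + (1 << 14)) >> 15;
        *dim = (im + (1 << 14)) >> 15;
    }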
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index e5bdcbd..a6ce1db 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -166,6 +166,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
#else
if (CONFIG_MDCT) s->mdct_calcw = ff_mdct_calcw_c;
if (ARCH_ARM) ff_fft_fixed_init_arm(s);
+ if (HAVE_MIPSDSPR2) ff_fft_fixed_init_mips(s);
#endif
for(j=4; j<=nbits; j++) {
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index c7d2cfb..01ac4f9 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -143,6 +143,7 @@ void ff_fft_init_mips(FFTContext *s);
#else
void ff_fft_fixed_init_arm(FFTContext *s);
void ff_ac3_fft_init_fixed(FFTContext *s);
+void ff_fft_fixed_init_mips(FFTContext *s);
#endif
void ff_fft_end(FFTContext *s);
diff --git a/libavcodec/fft_ac3_fixed.c b/libavcodec/fft_ac3_fixed.c
index 2796cb5..53968af 100644
--- a/libavcodec/fft_ac3_fixed.c
+++ b/libavcodec/fft_ac3_fixed.c
@@ -256,10 +256,12 @@ static void ff_fft_fixed_calc_mips(FFTContext *s, FFTComplex *z) {
void ff_ac3_fft_init_fixed(FFTContext *s) {
int n=0;
- ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
#if CONFIG_MDCT
s->imdct_half_fixed = ff_imdct_fixed_half_mips;
#endif /* CONFIG_MDCT */
s->fft_fixed_calc = ff_fft_fixed_calc_mips;
+
+ if (HAVE_MIPSDSPR2) ff_fft_fixed_init_mips(s);
+ else ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
}
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 17f6b13..2e6f4b7 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -15,6 +15,7 @@ MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_float.o
MIPSDSPR1-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_fixed.o
OBJS-$(CONFIG_FFT) += mips/fft_init_table.o
MIPSFPU-OBJS-$(CONFIG_FFT) += mips/fft_mips.o
+MIPSDSPR2-OBJS-$(CONFIG_FFT) += mips/fft_mips_fixed.o
MIPSFPU-OBJS-$(HAVE_INLINE_ASM) += mips/fmtconvert_mips.o
OBJS-$(HAVE_INLINE_ASM) += mips/dsputil_mips.o
MIPSDSPR1-OBJS-$(HAVE_INLINE_ASM) += mips/fmtconvert_mips_fixed.o
diff --git a/libavcodec/mips/fft_mips_fixed.c b/libavcodec/mips/fft_mips_fixed.c
new file mode 100644
index 0000000..299119a
--- /dev/null
+++ b/libavcodec/mips/fft_mips_fixed.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors: Stanislav Ocovaj (socovaj at mips.com)
+ * Dragan Mrdjan (dmrdjan at mips.com)
+ * Zoran Lukic (zlukic at mips.com)
+ * Bojan Zivkovic (bojan at mips.com)
+ *
+ * Optimization of FFT and MDCT/IMDCT transforms for MIPS fixed-point
+ * architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CONFIG_FFT_FLOAT 0
+#include "libavcodec/fft.h"
+#include "libavcodec/mips/fft_table.h"
+
+#if HAVE_INLINE_ASM
+static void ff_imdct_fixed_half_mips(FFTContext *s, FFTSample *output,
+                                     const FFTSample *input)
+{
+ int k, n8, n4, n2, n, j, j2;
+ int ax0, ax1, ax2, ax3;
+ const uint16_t *revtab = s->revtab;
+ const FFTSample *tcos = s->tcos;
+ const FFTSample *tsin = s->tsin;
+ const FFTSample *in1, *in2, *in3, *in4;
+ FFTComplex *z = (FFTComplex *)output;
+
+ FFTSample t0, t1, t2, t3, t01, t11, t21, t31;
+
+ n = 1 << s->mdct_bits;
+ n2 = n >> 1;
+ n4 = n >> 2;
+ n8 = n >> 3;
+
+ /* pre rotation */
+ in1 = input;
+ in3 = input + 2;
+ in2 = input + n2 - 1;
+ in4 = input + n2 - 3;
+
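+    /* Each asm block below computes two pre-rotated points:
+     *   z[revtab[k]].re = in2[0]*tcos[k] - in1[0]*tsin[k]   (mulsaq_s.w.ph)
+     *   z[revtab[k]].im = in2[0]*tsin[k] + in1[0]*tcos[k]   (dpaqx_s.w.ph)
+     * "multu $acN, $0, $0" clears each accumulator, append packs the two
+     * halfwords, and extr_r.w rounds the Q31 accumulator back to Q15. */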
+    for (k=0; k<n4; k+=4) {
+ int k1 = k * 2;
+ int k2 = k1 + 2;
+
+ __asm__ volatile (
+ "lh %[ax0], 0(%[in2]) \n\t"
+ "lh %[ax1], 0(%[in1]) \n\t"
+ "lhx %[ax2], %[k1](%[tcos]) \n\t"
+ "lhx %[ax3], %[k1](%[tsin]) \n\t"
+ "multu $ac0, $0, $0 \n\t"
+ "multu $ac1, $0, $0 \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "multu $ac2, $0, $0 \n\t"
+ "mulsaq_s.w.ph $ac0, %[ax0], %[ax2] \n\t"
+ "dpaqx_s.w.ph $ac1, %[ax0], %[ax2] \n\t"
+ "lh %[ax0], -4(%[in2]) \n\t"
+ "lh %[ax1], 4(%[in1]) \n\t"
+ "lhx %[ax2], %[k2](%[tcos]) \n\t"
+ "lhx %[ax3], %[k2](%[tsin]) \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "mulsaq_s.w.ph $ac2, %[ax0], %[ax2] \n\t"
+ "multu $ac3, $0, $0 \n\t"
+ "dpaqx_s.w.ph $ac3, %[ax0], %[ax2] \n\t"
+ "extr_r.w %[t0], $ac0, 16 \n\t"
+ "extr_r.w %[t2], $ac1, 16 \n\t"
+ "extr_r.w %[t1], $ac2, 16 \n\t"
+ "extr_r.w %[t3], $ac3, 16 \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+ : [in1] "r" (in1), [in2] "r" (in2), [tcos] "r" (tcos),
+ [tsin] "r" (tsin), [k1] "r" (k1), [k2] "r" (k2)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+ "$ac3hi", "$ac3lo"
+ );
+
+ j = revtab[k];
+ j2 = revtab[k+1];
+
+ z[j].re = t0;
+ z[j].im = t2;
+ z[j2].re = t1;
+ z[j2].im = t3;
+
+ k1 += 4;
+ k2 += 4;
+
+ __asm__ volatile (
+ "lh %[ax0], -8(%[in2]) \n\t"
+ "lh %[ax1], 8(%[in1]) \n\t"
+ "lhx %[ax2], %[k1](%[tcos]) \n\t"
+ "lhx %[ax3], %[k1](%[tsin]) \n\t"
+ "multu $ac0, $0, $0 \n\t"
+ "multu $ac1, $0, $0 \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "multu $ac2, $0, $0 \n\t"
+ "mulsaq_s.w.ph $ac0, %[ax0], %[ax2] \n\t"
+ "dpaqx_s.w.ph $ac1, %[ax0], %[ax2] \n\t"
+ "lh %[ax0], -12(%[in2]) \n\t"
+ "lh %[ax1], 12(%[in1]) \n\t"
+ "lhx %[ax2], %[k2](%[tcos]) \n\t"
+ "lhx %[ax3], %[k2](%[tsin]) \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "mulsaq_s.w.ph $ac2, %[ax0], %[ax2] \n\t"
+ "multu $ac3, $0, $0 \n\t"
+ "dpaqx_s.w.ph $ac3, %[ax0], %[ax2] \n\t"
+ "extr_r.w %[t0], $ac0, 16 \n\t"
+ "extr_r.w %[t2], $ac1, 16 \n\t"
+ "extr_r.w %[t1], $ac2, 16 \n\t"
+ "extr_r.w %[t3], $ac3, 16 \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+            : [in1] "r" (in1), [in2] "r" (in2), [tcos] "r" (tcos),
+              [tsin] "r" (tsin), [k1] "r" (k1), [k2] "r" (k2)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+ "$ac3hi", "$ac3lo"
+ );
+
+ j = revtab[k+2];
+ j2 = revtab[k+3];
+
+ z[j ].re = t0;
+ z[j ].im = t2;
+ z[j2].re = t1;
+ z[j2].im = t3;
+ in1 += 8;
+ in2 -= 8;
+ }
+
+ s->fft_fixed_calc(s, z);
+
+ /* post rotation + reordering */
+
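+    /* Each mirrored pair z[n8-k-1] / z[n8+k] is rotated as
+     *   re' = im*tsin[j] - re*tcos[j]
+     *   im' = im*tcos[j] + re*tsin[j]
+     * with the results spread across the two halves, again via
+     * mulsaq_s.w.ph / dpaqx_s.w.ph and a rounding extr_r.w. */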
+    for (k=0; k<n8; k+=2) {
+ int k1 = 2 * (n8 - k - 1), k2 = k1 - 2;
+ int k11 = 2 * (n8 + k), k21 = k11 + 2;
+ in1 = (const FFTSample*)(z + (n8 - k - 1));
+ in2 = (const FFTSample*)(z + (n8 + k));
+
+ __asm__ volatile (
+ "lh %[ax0], 2(%[in1]) \n\t"
+ "lh %[ax1], 0(%[in1]) \n\t"
+ "lhx %[ax2], %[k1](%[tsin]) \n\t"
+ "lhx %[ax3], %[k1](%[tcos]) \n\t"
+ "multu $ac0, $0, $0 \n\t"
+ "multu $ac1, $0, $0 \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "mulsaq_s.w.ph $ac0, %[ax0], %[ax2] \n\t"
+ "dpaqx_s.w.ph $ac1, %[ax0], %[ax2] \n\t"
+ "lh %[ax0], -2(%[in1]) \n\t"
+ "lh %[ax1], -4(%[in1]) \n\t"
+ "lhx %[ax2], %[k2](%[tsin]) \n\t"
+ "lhx %[ax3], %[k2](%[tcos]) \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "multu $ac2, $0, $0 \n\t"
+ "mulsaq_s.w.ph $ac2, %[ax0], %[ax2] \n\t"
+ "multu $ac3, $0, $0 \n\t"
+ "dpaqx_s.w.ph $ac3, %[ax0], %[ax2] \n\t"
+ "extr_r.w %[t0], $ac0, 16 \n\t"
+ "extr_r.w %[t2], $ac1, 16 \n\t"
+ "extr_r.w %[t1], $ac2, 16 \n\t"
+ "extr_r.w %[t3], $ac3, 16 \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+            : [in1] "r" (in1), [k1] "r" (k1), [k2] "r" (k2),
+              [tsin] "r" (tsin), [tcos] "r" (tcos)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+ "$ac3hi", "$ac3lo"
+ );
+
+ __asm__ volatile (
+ "lh %[ax0], 2(%[in2]) \n\t"
+ "lh %[ax1], 0(%[in2]) \n\t"
+ "lhx %[ax2], %[k11](%[tsin]) \n\t"
+ "lhx %[ax3], %[k11](%[tcos]) \n\t"
+ "multu $ac0, $0, $0 \n\t"
+ "multu $ac1, $0, $0 \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "mulsaq_s.w.ph $ac0, %[ax0], %[ax2] \n\t"
+ "dpaqx_s.w.ph $ac1, %[ax0], %[ax2] \n\t"
+ "lh %[ax0], 6(%[in2]) \n\t"
+ "lh %[ax1], 4(%[in2]) \n\t"
+ "lhx %[ax2], %[k21](%[tsin]) \n\t"
+ "lhx %[ax3], %[k21](%[tcos]) \n\t"
+ "append %[ax0], %[ax1], 16 \n\t"
+ "append %[ax2], %[ax3], 16 \n\t"
+ "multu $ac2, $0, $0 \n\t"
+ "mulsaq_s.w.ph $ac2, %[ax0], %[ax2] \n\t"
+ "multu $ac3, $0, $0 \n\t"
+ "dpaqx_s.w.ph $ac3, %[ax0], %[ax2] \n\t"
+ "extr_r.w %[t01], $ac0, 16 \n\t"
+ "extr_r.w %[t21], $ac1, 16 \n\t"
+ "extr_r.w %[t11], $ac2, 16 \n\t"
+ "extr_r.w %[t31], $ac3, 16 \n\t"
+
+            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
+              [t01] "=&r" (t01), [t11] "=&r" (t11), [t21] "=&r" (t21), [t31] "=&r" (t31)
+            : [in2] "r" (in2), [k11] "r" (k11), [k21] "r" (k21),
+              [tsin] "r" (tsin), [tcos] "r" (tcos)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+ "$ac3hi", "$ac3lo"
+ );
+
+ z[n8-k-1].re = t0;
+ z[n8+k ].im = t2;
+ z[n8-k-1].im = t21;
+ z[n8+k ].re = t01;
+
+        z[n8-k-2].re = t1;
+        z[n8+k+1].im = t3;
+        z[n8-k-2].im = t31;
+        z[n8+k+1].re = t11;
+ }
+}
+
+static void ff_fft_fixed_calc_mips(FFTContext *s, FFTComplex *z)
+{
+
+ int nbits, i, n, num_transforms, offset, step;
+ int n4, n2, n34;
+ FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int temp1, temp2, temp3, temp4;
+ int z0, z1, z2, z3;
+ int t12, t34, t56, t78, t0a, t1a, t2a, t3a;
+ int in1, in2, in3, in4;
+ FFTComplex *tmpz, *addr1, *addr2, *addr3;
+ int w_re, w_im;
+ FFTSample *w_re_ptr, *w_im_ptr;
+ int pom;
+ const int fft_size = (1 << s->nbits);
+
+ FFTComplex *tmpz_n2, *tmpz_n34, *tmpz_n4;
+ FFTComplex *tmpz_n2_i, *tmpz_n34_i, *tmpz_n4_i, *tmpz_i;
+
+ int z_re_n2, z_im_n2, z_re_n34, z_im_n34, z_re, z_im, z_re_n4, z_im_n4;
+
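+    /* Number of size-4 base transforms in the split-radix decomposition;
+     * 0x2aab encodes the count for nbits == 16 and is scaled down for
+     * smaller transforms. */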
+ num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+ for (n=0; n<num_transforms; n++)
+ {
+ offset = fft_offsets_lut[n] << 2;
+ tmpz = z + offset;
+
+ /* fft4 */
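+        /* Radix-4 butterfly on packed {im,re} halfword pairs: addq.ph and
+         * subq.ph add/subtract re and im in one instruction, and packrl.ph
+         * swaps the halves of t78 to realize the multiplication by +/-j. */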
+ __asm__ volatile (
+ "lw %[z0], 0(%[tmpz]) \n\t"
+ "lw %[z1], 4(%[tmpz]) \n\t"
+ "lw %[z2], 8(%[tmpz]) \n\t"
+ "lw %[z3], 12(%[tmpz]) \n\t"
+ "addq.ph %[t12], %[z0], %[z1] \n\t"
+ "subq.ph %[t34], %[z0], %[z1] \n\t"
+ "addq.ph %[t56], %[z2], %[z3] \n\t"
+ "subq.ph %[t78], %[z2], %[z3] \n\t"
+ "addq.ph %[t0a], %[t12], %[t56] \n\t"
+ "packrl.ph %[t78], %[t78], %[t78] \n\t"
+ "subq.ph %[t2a], %[t12], %[t56] \n\t"
+ "addq.ph %[t1a], %[t34], %[t78] \n\t"
+ "subq.ph %[t3a], %[t34], %[t78] \n\t"
+ "packrl.ph %[t1a], %[t1a], %[t1a] \n\t"
+ "packrl.ph %[t3a], %[t3a], %[t3a] \n\t"
+ "sw %[t0a], 0(%[tmpz]) \n\t"
+ "packrl.ph %[z1], %[t1a], %[t3a] \n\t"
+ "packrl.ph %[z3], %[t3a], %[t1a] \n\t"
+ "sw %[t2a], 8(%[tmpz]) \n\t"
+ "sw %[z3], 4(%[tmpz]) \n\t"
+ "sw %[z1], 12(%[tmpz]) \n\t"
+
+ : [z0] "=&r" (z0), [z1] "=&r" (z1), [t12] "=&r" (t12),
+ [z2] "=&r" (z2), [z3] "=&r" (z3), [t34] "=&r" (t34),
+ [t56] "=&r" (t56), [t78] "=&r" (t78), [t0a] "=&r" (t0a),
+ [t1a] "=&r" (t1a), [t2a] "=&r" (t2a), [t3a] "=&r" (t3a)
+ : [tmpz] "r" (tmpz)
+ : "memory"
+ );
+ }
+
+ if (fft_size < 8)
+ return;
+
+    pom = 23170; /* 1/sqrt(2) (= cos(pi/4)) in Q15 */
+
+ num_transforms = (num_transforms >> 1) | 1;
+ for (n=0; n<num_transforms; n++)
+ {
+ offset = fft_offsets_lut[n] << 3;
+ tmpz = z + offset;
+
+ /* fft8 */
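+        /* Radix-8 step: elements 4..7 are combined with the fft4 result in
+         * elements 0..3; the odd terms are scaled by pom = 1/sqrt(2) in Q15
+         * with a rounding shift (mul + shra_r.w 15). */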
+ __asm__ volatile (
+            "lw       %[in1],   16(%[tmpz])        \n\t"
+            "lw       %[in2],   20(%[tmpz])        \n\t"
+            "lw       %[in3],   24(%[tmpz])        \n\t"
+            "lw       %[in4],   28(%[tmpz])        \n\t"
+            "addq.ph  %[temp1], %[in1],  %[in2]    \n\t"
+            "subq.ph  %[temp3], %[in1],  %[in2]    \n\t"
+            "seh      %[tmp1],  %[temp1]           \n\t"
+            "sra      %[temp1], %[temp1], 16       \n\t"
+            "seh      %[tmp2],  %[temp1]           \n\t"
+            "addq.ph  %[temp2], %[in3],  %[in4]    \n\t"
+            "subq.ph  %[temp4], %[in3],  %[in4]    \n\t"
+            "seh      %[tmp3],  %[temp2]           \n\t"
+            "sra      %[temp2], %[temp2], 16       \n\t"
+            "seh      %[tmp4],  %[temp2]           \n\t"
+            "add      %[tmp5],  %[tmp1], %[tmp3]   \n\t"
+            "sub      %[tmp7],  %[tmp1], %[tmp3]   \n\t"
+            "add      %[tmp6],  %[tmp2], %[tmp4]   \n\t"
+            "sub      %[tmp8],  %[tmp2], %[tmp4]   \n\t"
+            "seh      %[tmp1],  %[temp3]           \n\t"
+            "sra      %[temp3], %[temp3], 16       \n\t"
+            "seh      %[tmp2],  %[temp3]           \n\t"
+            "seh      %[tmp3],  %[temp4]           \n\t"
+            "sra      %[temp4], %[temp4], 16       \n\t"
+            "seh      %[tmp4],  %[temp4]           \n\t"
+            "lw       %[in1],   0(%[tmpz])         \n\t"
+            "move     %[temp1], %[tmp6]            \n\t"
+            "append   %[temp1], %[tmp5], 16        \n\t"
+            "subq.ph  %[temp3], %[in1],  %[temp1]  \n\t"
+            "addq.ph  %[temp4], %[in1],  %[temp1]  \n\t"
+            "sw       %[temp3], 16(%[tmpz])        \n\t"
+            "sw       %[temp4], 0(%[tmpz])         \n\t"
+            "lw       %[in2],   8(%[tmpz])         \n\t"
+            "negu     %[temp1], %[tmp7]            \n\t"
+            "append   %[temp1], %[tmp8], 16        \n\t"
+            "subq.ph  %[temp2], %[in2],  %[temp1]  \n\t"
+            "addq.ph  %[temp3], %[in2],  %[temp1]  \n\t"
+            "sw       %[temp2], 24(%[tmpz])        \n\t"
+            "sw       %[temp3], 8(%[tmpz])         \n\t"
+            "add      %[tmp5],  %[tmp1], %[tmp2]   \n\t"
+            "mul      %[tmp5],  %[tmp5], %[pom]    \n\t"
+            "sub      %[tmp6],  %[tmp2], %[tmp1]   \n\t"
+            "mul      %[tmp6],  %[tmp6], %[pom]    \n\t"
+            "sub      %[tmp7],  %[tmp3], %[tmp4]   \n\t"
+            "mul      %[tmp7],  %[tmp7], %[pom]    \n\t"
+            "add      %[tmp8],  %[tmp3], %[tmp4]   \n\t"
+            "mul      %[tmp8],  %[tmp8], %[pom]    \n\t"
+            "shra_r.w %[tmp5],  %[tmp5], 15        \n\t"
+            "lw       %[in1],   4(%[tmpz])         \n\t"
+            "shra_r.w %[tmp6],  %[tmp6], 15        \n\t"
+            "lw       %[in2],   12(%[tmpz])        \n\t"
+            "shra_r.w %[tmp7],  %[tmp7], 15        \n\t"
+            "add      %[tmp1],  %[tmp5], %[tmp7]   \n\t"
+            "shra_r.w %[tmp8],  %[tmp8], 15        \n\t"
+            "add      %[tmp2],  %[tmp6], %[tmp8]   \n\t"
+            "sub      %[tmp3],  %[tmp5], %[tmp7]   \n\t"
+            "sub      %[tmp4],  %[tmp6], %[tmp8]   \n\t"
+            "move     %[temp1], %[tmp2]            \n\t"
+            "append   %[temp1], %[tmp1], 16        \n\t"
+            "subq.ph  %[temp2], %[in1],  %[temp1]  \n\t"
+            "addq.ph  %[temp3], %[in1],  %[temp1]  \n\t"
+            "sw       %[temp2], 20(%[tmpz])        \n\t"
+            "sw       %[temp3], 4(%[tmpz])         \n\t"
+            "negu     %[temp1], %[tmp3]            \n\t"
+            "append   %[temp1], %[tmp4], 16        \n\t"
+            "subq.ph  %[temp2], %[in2],  %[temp1]  \n\t"
+            "addq.ph  %[temp3], %[in2],  %[temp1]  \n\t"
+            "sw       %[temp2], 28(%[tmpz])        \n\t"
+            "sw       %[temp3], 12(%[tmpz])        \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
+ [tmp4] "=&r" (tmp4), [tmp5] "=&r" (tmp5), [tmp6] "=&r" (tmp6),
+ [tmp7] "=&r" (tmp7), [tmp8] "=&r" (tmp8), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4),
+ [in1] "=&r" (in1), [in2] "=&r" (in2), [in3] "=&r" (in3),
+ [in4] "=&r" (in4)
+ : [tmpz] "r" (tmpz), [pom] "r" (pom)
+ : "memory"
+ );
+ }
+
+ step = 1 << (MAX_LOG2_NFFT - 4);
+ n4 = 4;
+
+ for (nbits=4; nbits<=s->nbits; nbits++)
+ {
+ n2 = 2*n4;
+ n34 = 3*n4;
+ num_transforms = (num_transforms >> 1) | 1;
+ for (n=0; n<num_transforms; n++)
+ {
+ offset = fft_offsets_lut[n] << nbits;
+ tmpz = z + offset;
+
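+            /* Split-radix merge: the packed block below is the twiddle-free
+             * i == 0 butterfly across tmpz[0], tmpz[n4], tmpz[n2] and
+             * tmpz[n34]; the loop that follows applies the Q15 twiddles. */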
+ __asm__ volatile (
+ "sll %[z0], %[n2], 2 \n\t"
+ "sll %[z1], %[n34], 2 \n\t"
+ "sll %[z2], %[n4], 2 \n\t"
+ "addu %[addr1], %[tmpz], %[z0] \n\t"
+ "addu %[addr2], %[tmpz], %[z1] \n\t"
+ "addu %[addr3], %[tmpz], %[z2] \n\t"
+ "lw %[z0], 0(%[addr1]) \n\t"
+ "lw %[z1], 0(%[addr2]) \n\t"
+ "lw %[z2], 0(%[tmpz]) \n\t"
+ "lw %[z3], 0(%[addr3]) \n\t"
+ "addq.ph %[t56], %[z0], %[z1] \n\t"
+ "subq.ph %[t12], %[z0], %[z1] \n\t"
+ "addq.ph %[t0a], %[z2], %[t56] \n\t"
+ "packrl.ph %[z3], %[z3], %[z3] \n\t"
+ "subq.ph %[t2a], %[z2], %[t56] \n\t"
+ "addq.ph %[t1a], %[z3], %[t12] \n\t"
+ "subq.ph %[t3a], %[z3], %[t12] \n\t"
+ "sw %[t0a], 0(%[tmpz]) \n\t"
+ "sw %[t2a], 0(%[addr1]) \n\t"
+ "packrl.ph %[z0], %[t1a], %[t3a] \n\t"
+ "packrl.ph %[z1], %[t3a], %[t1a] \n\t"
+ "sw %[z0], 0(%[addr2]) \n\t"
+ "sw %[z1], 0(%[addr3]) \n\t"
+
+                : [z0] "=&r" (z0), [z1] "=&r" (z1), [t12] "=&r" (t12),
+                  [z2] "=&r" (z2), [z3] "=&r" (z3),
+                  [t56] "=&r" (t56), [t0a] "=&r" (t0a), [t1a] "=&r" (t1a),
+                  [t2a] "=&r" (t2a), [t3a] "=&r" (t3a), [addr1] "=&r" (addr1),
+                  [addr2] "=&r" (addr2), [addr3] "=&r" (addr3)
+                : [n2] "r" (n2), [n34] "r" (n34), [n4] "r" (n4), [tmpz] "r" (tmpz)
+ : "memory"
+ );
+
+ w_re_ptr = (FFTSample*)(ff_cos_65536_fixed + step);
+ w_im_ptr = (FFTSample*)(ff_cos_65536_fixed + MAX_FFT_SIZE/4 - step);
+
+        for (i=1; i<n4; i++)
+ {
+ w_re = w_re_ptr[0];
+ w_im = w_im_ptr[0];
+
+ tmpz_n2 = tmpz + n2;
+ tmpz_n4 = tmpz + n4;
+ tmpz_n34 = tmpz + n34;
+
+ tmpz_n2_i = tmpz_n2 + i;
+ tmpz_n4_i = tmpz_n4 + i;
+ tmpz_n34_i = tmpz_n34 + i;
+ tmpz_i = tmpz + i;
+
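+                /* Rotate z[n2+i] and z[n34+i] by w = (w_re, w_im) on four
+                 * accumulators (mult/madd/msub, rounded to Q15 by extr_r.w),
+                 * then form the butterfly with saturating halfword adds. */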
+ __asm__ volatile (
+ "lh %[z_re_n2], 0(%[tmpz_n2_i]) \n\t"
+ "lh %[z_im_n2], 2(%[tmpz_n2_i]) \n\t"
+ "lh %[z_re_n34], 0(%[tmpz_n34_i]) \n\t"
+ "lh %[z_im_n34], 2(%[tmpz_n34_i]) \n\t"
+ "mult $ac0, %[w_re], %[z_re_n2] \n\t"
+ "mult $ac2, %[w_re], %[z_re_n34] \n\t"
+ "mult $ac1, %[w_re], %[z_im_n2] \n\t"
+ "mult $ac3, %[w_re], %[z_im_n34] \n\t"
+ "madd $ac0, %[w_im], %[z_im_n2] \n\t"
+ "msub $ac2, %[w_im], %[z_im_n34] \n\t"
+ "msub $ac1, %[w_im], %[z_re_n2] \n\t"
+ "madd $ac3, %[w_im], %[z_re_n34] \n\t"
+ "lh %[z_re], 0(%[tmpz_i]) \n\t"
+ "extr_r.w %[tmp1], $ac0, 15 \n\t"
+ "extr_r.w %[tmp3], $ac2, 15 \n\t"
+ "extr_r.w %[tmp2], $ac1, 15 \n\t"
+ "extr_r.w %[tmp4], $ac3, 15 \n\t"
+ "lh %[z_im], 2(%[tmpz_i]) \n\t"
+ "lh %[z_re_n4], 0(%[tmpz_n4_i]) \n\t"
+ "lh %[z_im_n4], 2(%[tmpz_n4_i]) \n\t"
+ "add %[tmp5], %[tmp1], %[tmp3] \n\t"
+ "sub %[tmp1], %[tmp1], %[tmp3] \n\t"
+ "add %[tmp6], %[tmp2], %[tmp4] \n\t"
+ "sub %[tmp2], %[tmp2], %[tmp4] \n\t"
+ "subq_s.ph %[z_re_n2], %[z_re], %[tmp5] \n\t"
+ "addq_s.ph %[z_re], %[z_re], %[tmp5] \n\t"
+ "subq_s.ph %[z_im_n2], %[z_im], %[tmp6] \n\t"
+ "addq_s.ph %[z_im], %[z_im], %[tmp6] \n\t"
+ "sh %[z_re_n2], 0(%[tmpz_n2_i]) \n\t"
+ "sh %[z_re], 0(%[tmpz_i]) \n\t"
+ "sh %[z_im_n2], 2(%[tmpz_n2_i]) \n\t"
+ "sh %[z_im], 2(%[tmpz_i]) \n\t"
+ "subq_s.ph %[z_re_n34], %[z_re_n4], %[tmp2] \n\t"
+ "addq_s.ph %[z_re_n4], %[z_re_n4], %[tmp2] \n\t"
+ "addq_s.ph %[z_im_n34], %[z_im_n4], %[tmp1] \n\t"
+ "subq_s.ph %[z_im_n4], %[z_im_n4], %[tmp1] \n\t"
+ "sh %[z_re_n34], 0(%[tmpz_n34_i]) \n\t"
+ "sh %[z_re_n4], 0(%[tmpz_n4_i]) \n\t"
+ "sh %[z_im_n34], 2(%[tmpz_n34_i]) \n\t"
+ "sh %[z_im_n4], 2(%[tmpz_n4_i]) \n\t"
+
+ : [z_re_n2] "=&r" (z_re_n2), [z_re] "=&r" (z_re), [z_im] "=&r" (z_im),
+ [z_im_n2] "=&r" (z_im_n2), [z_re_n34] "=&r" (z_re_n34),
+ [z_im_n4] "=&r" (z_im_n4), [z_re_n4] "=&r" (z_re_n4),
+                  [z_im_n34] "=&r" (z_im_n34), [tmp1] "=&r" (tmp1),
+ [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+ [tmp5] "=&r" (tmp5), [tmp6] "=&r" (tmp6)
+ : [w_re] "r" (w_re), [w_im] "r" (w_im), [tmpz_n2_i] "r" (tmpz_n2_i),
+ [tmpz_n34_i] "r" (tmpz_n34_i), [tmpz_n4_i] "r" (tmpz_n4_i),
+ [tmpz_i] "r" (tmpz_i)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+ "$ac3hi", "$ac3lo"
+ );
+ w_re_ptr += step;
+ w_im_ptr -= step;
+ }
+ }
+ step >>= 1;
+ n4 <<= 1;
+ }
+}
+#endif /* HAVE_INLINE_ASM */
+
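+/* Select the DSPr2 fixed-point FFT/IMDCT implementations and build the
+ * split-radix offset LUT they use. */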
+void ff_fft_fixed_init_mips(FFTContext *s)
+{
+
+#if HAVE_INLINE_ASM
+ int n=0;
+ ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
+
+#if CONFIG_MDCT
+ s->imdct_half_fixed = ff_imdct_fixed_half_mips;
+#endif /* CONFIG_MDCT */
+ s->fft_fixed_calc = ff_fft_fixed_calc_mips;
+#endif /* HAVE_INLINE_ASM */
+}
--
1.7.3.4