[FFmpeg-cvslog] twinvq: add SSE/AVX optimized sum/difference stereo interleaving
Justin Ruggles
git at videolan.org
Sat Nov 12 03:02:02 CET 2011
ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Sun Oct 30 01:13:55 2011 -0400| [9d06037d48041ad8ccbae6c12aa9f3a313a89c4e] | committer: Justin Ruggles
twinvq: add SSE/AVX optimized sum/difference stereo interleaving
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9d06037d48041ad8ccbae6c12aa9f3a313a89c4e
---
libavcodec/dsputil.c | 13 ++++++++++
libavcodec/dsputil.h | 17 +++++++++++++
libavcodec/twinvq.c | 34 +++++++++++++--------------
libavcodec/x86/dsputil_mmx.c | 7 +++++
libavcodec/x86/dsputil_yasm.asm | 48 +++++++++++++++++++++++++++++++++++++++
5 files changed, 101 insertions(+), 18 deletions(-)
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 182063c..9123857 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
}
}
+static void butterflies_float_interleave_c(float *dst, const float *src0,
+ const float *src1, int len)
+{ /* Plain C version: reads len floats from each input and writes 2*len floats to dst. */
+ int i;
+ for (i = 0; i < len; i++) {
+ float f1 = src0[i]; /* element i of the first input */
+ float f2 = src1[i]; /* element i of the second input */
+ dst[2*i ] = f1 + f2; /* even output slot holds the sum */
+ dst[2*i + 1] = f1 - f2; /* odd output slot holds the difference */
+ }
+}
+
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
float p = 0.0;
@@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->vector_clip_int32 = vector_clip_int32_c;
c->scalarproduct_float = scalarproduct_float_c;
c->butterflies_float = butterflies_float_c;
+ c->butterflies_float_interleave = butterflies_float_interleave_c;
c->vector_fmul_scalar = vector_fmul_scalar_c;
c->vector_fmac_scalar = vector_fmac_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index acb2041..98b7b1e 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -453,6 +453,23 @@ typedef struct DSPContext {
*/
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
+ /**
+ * Calculate the sum and difference of two vectors of floats and interleave
+ * results into a separate output vector of floats, with each sum
+ * positioned before the corresponding difference.
+ *
+ * @param dst output vector
+ * constraints: 16-byte aligned
+ * @param src0 first input vector
+ * constraints: 32-byte aligned
+ * @param src1 second input vector
+ * constraints: 32-byte aligned
+ * @param len number of elements in the input
+ * constraints: multiple of 8
+ */
+ void (*butterflies_float_interleave)(float *dst, const float *src0,
+ const float *src1, int len);
+
/* (I)DCT */
void (*fdct)(DCTELEM *block/* align 16*/);
void (*fdct248)(DCTELEM *block/* align 16*/);
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 73eb7c1..a285156 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
float *out)
{
const ModeTab *mtab = tctx->mtab;
+ int size1, size2;
float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
- int i, j;
+ int i;
for (i = 0; i < tctx->avctx->channels; i++) {
imdct_and_window(tctx, ftype, wtype,
@@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
i);
}
+ size2 = tctx->last_block_pos[0];
+ size1 = mtab->size - size2;
if (tctx->avctx->channels == 2) {
- for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
- float f1 = prev_buf[ i];
- float f2 = prev_buf[2*mtab->size + i];
- out[2*i ] = f1 + f2;
- out[2*i + 1] = f1 - f2;
- }
- for (j = 0; i < mtab->size; j++,i++) {
- float f1 = tctx->curr_frame[ j];
- float f2 = tctx->curr_frame[2*mtab->size + j];
- out[2*i ] = f1 + f2;
- out[2*i + 1] = f1 - f2;
- }
+ tctx->dsp.butterflies_float_interleave(out, prev_buf,
+ &prev_buf[2*mtab->size],
+ size1);
+
+ out += 2 * size1;
+
+ tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
+ &tctx->curr_frame[2*mtab->size],
+ size2);
} else {
- memcpy(out, prev_buf,
- (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
+ memcpy(out, prev_buf, size1 * sizeof(*out));
- out += mtab->size - tctx->last_block_pos[0];
+ out += size1;
- memcpy(out, tctx->curr_frame,
- (tctx->last_block_pos[0]) * sizeof(*out));
+ memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
}
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index dd6cbf5..f0de05a 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
+extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
+ const float *src1, int len);
+extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
+ const float *src1, int len);
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
@@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
+ c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
#endif
}
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
@@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
}
+ c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
}
#endif
}
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 8e3cbdc..f2894cd 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
+
+;-----------------------------------------------------------------------------
+; void ff_butterflies_float_interleave(float *dst, const float *src0,
+; const float *src1, int len);
+;-----------------------------------------------------------------------------
+
+%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
+cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
+%ifdef ARCH_X86_64
+ movsxd lenq, lend ; sign-extend the 32-bit len argument to 64 bits
+%endif
+ test lenq, lenq
+ jz .end ; nothing to do when len == 0
+ shl lenq, 2 ; element count -> byte count (4 bytes per float)
+ lea src0q, [src0q + lenq] ; point src0 at the end of its input
+ lea src1q, [src1q + lenq] ; point src1 at the end of its input
+ lea dstq, [ dstq + 2*lenq] ; output is twice as long, so advance dst by 2*len bytes
+ neg lenq ; negative byte offset that counts up towards zero
+.loop:
+ mova m0, [src0q + lenq] ; one vector from src0
+ mova m1, [src1q + lenq] ; one vector from src1
+ subps m2, m0, m1 ; m2 = src0 - src1 (differences)
+ addps m0, m0, m1 ; m0 = src0 + src1 (sums)
+ unpcklps m1, m0, m2 ; interleave sum/difference pairs from the low half of each 128-bit lane
+ unpckhps m0, m0, m2 ; interleave sum/difference pairs from the high half of each 128-bit lane
+%if cpuflag(avx)
+ vextractf128 [dstq + 2*lenq ], m1, 0 ; unpack is per 128-bit lane, so store lanes in order: m1 low,
+ vextractf128 [dstq + 2*lenq + 16], m0, 0 ; m0 low,
+ vextractf128 [dstq + 2*lenq + 32], m1, 1 ; m1 high,
+ vextractf128 [dstq + 2*lenq + 48], m0, 1 ; m0 high
+%else
+ mova [dstq + 2*lenq ], m1 ; first output vector: pairs built from the low input half
+ mova [dstq + 2*lenq + mmsize], m0 ; second output vector: pairs built from the high input half
+%endif
+ add lenq, mmsize ; advance by one input vector
+ jl .loop ; keep going while the offset is still negative
+%if mmsize == 32
+ vzeroupper ; emitted only in the 32-byte (ymm) build
+ RET
+%endif
+.end:
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+BUTTERFLIES_FLOAT_INTERLEAVE
+INIT_YMM avx
+BUTTERFLIES_FLOAT_INTERLEAVE
More information about the ffmpeg-cvslog
mailing list