[FFmpeg-devel] [PATCH] Optimization for add_8x8basis
Zuxy Meng
zuxy.meng
Tue May 15 17:43:19 CEST 2007
Hi,
2007/5/15, Michael Niedermayer <michaelni at gmx.at>:
> Hi
>
> you can use svn annotate dsputil_mmx.c and svn log dsputil_mmx.c
> to find that out ...
How about this one?
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c ?????? 9017??
+++ libavcodec/i386/dsputil_mmx.c ????????????
@@ -2750,91 +2750,69 @@
}
#ifdef CONFIG_ENCODERS
-static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
- long i=0;
- assert(FFABS(scale) < 256);
- scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
+#define PHADDD(a, t)\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $32, "#a" \n\t"\
+ "paddd "#t", "#a" \n\t"
+/*
+ pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
+ pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
+ pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
+ */
+#define PMULHRW(x, y, s, o)\
+ "pmulhw " #s ", "#x " \n\t"\
+ "pmulhw " #s ", "#y " \n\t"\
+ "paddw " #o ", "#x " \n\t"\
+ "paddw " #o ", "#y " \n\t"\
+ "psraw $1, "#x " \n\t"\
+ "psraw $1, "#y " \n\t"
+#define DEF(x) x ## _mmx
+#define SET_RND MOVQ_WONE
+#define SCALE_OFFSET 1
- asm volatile(
- "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
- "psrlw $15, %%mm6 \n\t" // 1w
- "pxor %%mm7, %%mm7 \n\t"
- "movd %4, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "pmulhw %%mm5, %%mm0 \n\t"
- "pmulhw %%mm5, %%mm1 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
- "psraw $6, %%mm1 \n\t"
- "pmullw (%3, %0), %%mm0 \n\t"
- "pmullw 8(%3, %0), %%mm1 \n\t"
- "pmaddwd %%mm0, %%mm0 \n\t"
- "pmaddwd %%mm1, %%mm1 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "psrld $4, %%mm0 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "psrlq $32, %%mm7 \n\t"
- "paddd %%mm6, %%mm7 \n\t"
- "psrld $2, %%mm7 \n\t"
- "movd %%mm7, %0 \n\t"
+#include "dsputil_mmx_qns.h"
- : "+r" (i)
- : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
- );
- return i;
-}
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
-static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
- long i=0;
+#define DEF(x) x ## _3dnow
+#define SET_RND(x)
+#define SCALE_OFFSET 0
+#define PMULHRW(x, y, s, o)\
+ "pmulhrw " #s ", "#x " \n\t"\
+ "pmulhrw " #s ", "#y " \n\t"
- if(FFABS(scale) < 256){
- scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
- asm volatile(
- "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
- "psrlw $15, %%mm6 \n\t" // 1w
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "pmulhw %%mm5, %%mm0 \n\t"
- "pmulhw %%mm5, %%mm1 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
+#include "dsputil_mmx_qns.h"
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- );
- }else{
- for(i=0; i<8*8; i++){
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+
+#ifdef HAVE_SSSE3
+#undef PHADDD
+#define DEF(x) x ## _ssse3
+#define SET_RND(x)
+#define SCALE_OFFSET -1
+#define PHADDD(a, t)\
+ "pshufw $0x0E, "#a", "#t" \n\t"\
+ "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
+#define PMULHRW(x, y, s, o)\
+ "pmulhrsw " #s ", "#x " \n\t"\
+ "pmulhrsw " #s ", "#y " \n\t"
+
+#include "dsputil_mmx_qns.h"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+#undef PHADDD
+#endif //HAVE_SSSE3
+
#endif /* CONFIG_ENCODERS */
#define PREFETCH(name, op) \
@@ -3646,6 +3624,10 @@
#ifdef HAVE_SSSE3
if(mm_flags & MM_SSSE3){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_ssse3;
+ }
+ c->add_8x8basis= add_8x8basis_ssse3;
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
@@ -3667,6 +3649,12 @@
#endif
if(mm_flags & MM_3DNOW){
+#ifdef CONFIG_ENCODERS
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_3dnow;
+ }
+ c->add_8x8basis= add_8x8basis_3dnow;
+#endif //CONFIG_ENCODERS
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT))
Index: libavcodec/i386/dsputil_mmx_qns.h
===================================================================
--- libavcodec/i386/dsputil_mmx_qns.h ?????? 0??
+++ libavcodec/i386/dsputil_mmx_qns.h ?????? 0??
@@ -0,0 +1,102 @@
+/*
+ * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
+ * Copyright (c) 2004 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Michael Niedermayer <michaelni at gmx.at>
+ * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng at gmail.com>
+ */
+
+#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
+
+static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
+{
+ long i=0;
+
+ assert(FFABS(scale) < MAX_ABS);
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+
+ SET_RND(mm6);
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %4, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "pmullw (%3, %0), %%mm0 \n\t"
+ "pmullw 8(%3, %0), %%mm1 \n\t"
+ "pmaddwd %%mm0, %%mm0 \n\t"
+ "pmaddwd %%mm1, %%mm1 \n\t"
+ "paddd %%mm1, %%mm0 \n\t"
+ "psrld $4, %%mm0 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" //FIXME optimize & bench
+ " jb 1b \n\t"
+ PHADDD(%%mm7, %%mm6)
+ "psrld $2, %%mm7 \n\t"
+ "movd %%mm7, %0 \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+ );
+ return i;
+}
+
+static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
+{
+ long i=0;
+
+ if(FFABS(scale) < MAX_ABS){
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+ SET_RND(mm6);
+ asm volatile(
+ "movd %3, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "movq %%mm0, (%2, %0) \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" // FIXME optimize & bench
+ " jb 1b \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "g"(scale)
+ );
+ }else{
+ for(i=0; i<8*8; i++){
+ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
+ }
+ }
+}
+
More information about the ffmpeg-devel
mailing list