[FFmpeg-devel] [PATCH] VC-1 MMX DSP functions
Zuxy Meng
zuxy.meng
Sun Jul 8 17:14:17 CEST 2007
Hi,
2007/7/8, Zuxy Meng <zuxy.meng at gmail.com>:
> Hi,
>
> 2007/7/7, Christophe GISQUET <christophe.gisquet at free.fr>:
> > Hello,
> >
> > here are the MMX functions now licensed under the MIT license.
> >
> > Zuxy Meng has been working on SSE2 versions of those; I'm not sure if he
> > would agree to contribute to this file using MIT license. In that case,
> > I don't mind the license being changed, but I would prefer having the
> > MIT licensing available in the svn history.
>
> I care less about license issues than raw performance :-)
>
> I did a quick test on 64-bit K8 tonight thanks to Stephan's testbed.
> The result wasn't promising. In short, from fastest to slowest:
> MMX > SSE2 w/o sw pipelining > SSE2 w/ sw pipelining
>
> The reason may be that on K8 SSE2 is throughput bound (K8 can decode 3
> MMX instructions per cycle, but only 1.5 SSE2 ones), and sw pipelining
> increases the # of instructions per loop. If AMD does what they've
> promised on their upcoming K10, I guess the result will be:
> SSE2 w/o sw pipelining > SSE2 w/ sw pipelining > MMX
>
> And IIRC on your 32-bit Conroe, where SSE2 is latency bound (punpcklbw
> and unaligned movq are slow), the list is somewhat different:
> SSE2 w/ sw pipelining > MMX > SSE2 w/o sw pipelining
>
> On my Dothan:
> MMX > SSE2 w/ sw pipelining > SSE2 w/o sw pipelining
>
> So the conclusion is that I can't make a conclusion. Any suggestions?
I just tried unrolling the loop so that the # of instructions per
iteration stays the same after sw pipelining, and the speed improves a
little:
Now SSE2 is about the same speed as MMX (+- 0.5%) both on my Dothan
and Stephan's 64-bit K8.
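
Roughly, the loop structure now looks like this (a plain C sketch, not
the real asm; filter_row() and the 8-byte memcpy()s just stand in for
the actual filtering and loads):

    /* plain loop: the filter has to wait for each row's load */
    for (i = 0; i < h; i++) {
        memcpy(row, src, 8);
        filter_row(dst, row);
        src += sstr; dst += dstr;
    }

    /* software-pipelined and unrolled by 2: the next row is loaded
     * before the current one is filtered, hiding the load latency,
     * and thanks to the unrolling the instruction count per output
     * row stays the same as in the plain loop.  (The real asm checks
     * the row counter before the look-ahead load, so it never reads
     * past the last row.) */
    memcpy(cur, src, 8);                  /* prologue: load row 0 */
    for (i = 0; i < h; i += 2) {
        memcpy(next, src + sstr, 8);      /* load row i+1 early   */
        filter_row(dst, cur);             /* finish row i         */
        src += sstr; dst += dstr;
        memcpy(cur, src + sstr, 8);       /* load row i+2 early   */
        filter_row(dst, next);            /* finish row i+1       */
        src += sstr; dst += dstr;
    }
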
The attached patch isn't against Christophe's newest version and may look
ugly, but it serves as a base for further improvement.
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
/*
* VC-1 and WMV3 decoder - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License version 2.0 as published by the Free Software Foundation.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "dsputil.h"
#include "x86_cpu.h"
DECLARE_ALIGNED_16(static const uint64_t,ff_fact_53[2] ) = { 0x0035003500350035ULL, 0x0035003500350035ULL };
DECLARE_ALIGNED_16(static const uint64_t,ff_fact_18[2] ) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
/** Add the rounder in mm7 to mm3/mm4, pack and store the result at the destination */
#define NORMALIZE_MMX(SHIFT) \
"paddsw %%mm7, %%mm3 \n\t" /* +bias-r */ \
"paddsw %%mm7, %%mm4 \n\t" /* +bias-r */ \
"psraw $"SHIFT", %%mm3 \n\t" \
"psraw $"SHIFT", %%mm4 \n\t" \
"packuswb %%mm4, %%mm3 \n\t" \
"movq %%mm3, (%2) \n\t"
/** Load the rounder 32-r or 8-r and broadcast it to all words of mm7 */
#define LOAD_ROUNDER_MMX \
"movd %7, %%mm7 \n\t" \
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t" /* pshufw */
/** Add the rounder in xmm7 to the given register, pack and store the result at the destination */
#define NORMALIZE_SSE2(SHIFT, XMM) \
"paddsw %%xmm7, "XMM" \n\t" /* +bias-r */ \
"psraw $"SHIFT", "XMM" \n\t" \
"packuswb "XMM", "XMM" \n\t" \
"movq "XMM", (%2) \n\t"
/** Load the rounder 32-r or 8-r and broadcast it to all words of xmm7 */
#define LOAD_ROUNDER_SSE2 \
"movd %7, %%xmm7 \n\t" \
"punpcklwd %%xmm7, %%xmm7 \n\t" \
"pshufd $0, %%xmm7, %%xmm7 \n\t"
/** 1/2 shift for MMX instruction set */
static void vc1_put_shift2_mmx(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset)
{
src -= offset;
rnd = 8-rnd;
asm volatile(
LOAD_ROUNDER_MMX
ASMALIGN(3)
"1: \n\t"
"movd 0(%1,%5 ), %%mm3 \n\t"
"movd 4(%1,%5 ), %%mm4 \n\t"
"movd 0(%1,%5,2), %%mm1 \n\t"
"movd 4(%1,%5,2), %%mm2 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t"
"punpcklbw %%mm0, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t"
"paddw %%mm1, %%mm3 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm1 \n\t"
"movq %%mm4, %%mm2 \n\t"
"psllw $3, %%mm3 \n\t" /* 8* */
"psllw $3, %%mm4 \n\t" /* 8* */
"paddw %%mm1, %%mm3 \n\t" /* 9,9 */
"paddw %%mm2, %%mm4 \n\t" /* 9,9 */
"movd 0(%1 ), %%mm1 \n\t"
"movd 4(%1 ), %%mm2 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t"
"psubsw %%mm1, %%mm3 \n\t" /* -1,9,9 */
"psubsw %%mm2, %%mm4 \n\t" /* -1,9,9 */
"movd 0(%1,%6 ), %%mm1 \n\t"
"movd 4(%1,%6 ), %%mm2 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t"
"psubsw %%mm1, %%mm3 \n\t"
"psubsw %%mm2, %%mm4 \n\t"
NORMALIZE_MMX("4")
"add %3, %1 \n\t"
"add %4, %2 \n\t"
"dec %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r" (src), "+r" (dst)
: "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd)
: "memory"
);
}
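/*
 * For reference, per output byte the 1/2-shift routines compute the VC-1
 * bicubic half-pel value with taps (-1, 9, 9, -1)/16.  A scalar sketch of
 * the same arithmetic (kept disabled; av_clip_uint8() assumed available
 * through the usual headers):
 */
#if 0
static void vc1_put_shift2_c(uint8_t *dst, int dstr, const uint8_t *src,
                             int sstr, int h, int rnd, int offset)
{
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < 8; i++)
            dst[i] = av_clip_uint8((-   src[i -   offset]
                                    + 9*src[i           ]
                                    + 9*src[i +   offset]
                                    -   src[i + 2*offset]
                                    + 8 - rnd) >> 4);
        src += sstr;
        dst += dstr;
    }
}
#endif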
/**
* Macro to build vc1_put_shift[13].
* Address parameters must be expressed in terms of %5 (offset) and %6 (3*offset).
*
* @param NAME Either 1 or 3
* @param A1 Offset for tap having coefficient -3
* @param A2 Offset for tap having coefficient 18
* @param A3 Offset for tap having coefficient 53
* @param A4 Offset for tap having coefficient -4
*/
#ifndef ARCH_X86_64
#define MSPEL_FILTER13(NAME, A1, A2, A3, A4) \
static void vc1_put_shift ## NAME ## _mmx(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) \
{ \
src -= offset; \
rnd = 32-rnd; \
asm volatile( \
LOAD_ROUNDER_MMX \
ASMALIGN(3) \
"1: \n\t" \
"movd 0"A1", %%mm1 \n\t" \
"movd 4"A1", %%mm2 \n\t" \
"punpcklbw %%mm0, %%mm1 \n\t" \
"punpcklbw %%mm0, %%mm2 \n\t" \
"movq %%mm1, %%mm3 \n\t" \
"movq %%mm2, %%mm4 \n\t" \
"paddw %%mm1, %%mm1 \n\t" \
"paddw %%mm2, %%mm2 \n\t" \
"paddsw %%mm3, %%mm1 \n\t" /* 3* */ \
"paddsw %%mm4, %%mm2 \n\t" /* 3* */ \
"movd 0"A2", %%mm3 \n\t" \
"movd 4"A2", %%mm4 \n\t" \
"punpcklbw %%mm0, %%mm3 \n\t" \
"punpcklbw %%mm0, %%mm4 \n\t" \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
"psubsw %%mm1, %%mm3 \n\t" /*18,-3 */ \
"psubsw %%mm2, %%mm4 \n\t" /*18,-3 */ \
"movd 0"A3", %%mm1 \n\t" \
"movd 4"A3", %%mm2 \n\t" \
"punpcklbw %%mm0, %%mm1 \n\t" \
"punpcklbw %%mm0, %%mm2 \n\t" \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
"paddsw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
"paddsw %%mm2, %%mm4 \n\t" /*53,18,-3 */ \
"movd 0"A4", %%mm1 \n\t" \
"movd 4"A4", %%mm2 \n\t" \
"punpcklbw %%mm0, %%mm1 \n\t" \
"punpcklbw %%mm0, %%mm2 \n\t" \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psllw $2, %%mm2 \n\t" /* 4* */ \
"psubsw %%mm1, %%mm3 \n\t" \
"psubsw %%mm2, %%mm4 \n\t" \
NORMALIZE_MMX("6") \
"add %3, %1 \n\t" \
"add %4, %2 \n\t" \
"dec %0 \n\t" \
"jnz 1b \n\t" \
: "+g"(h), "+r" (src), "+r" (dst) \
: "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd) \
: "memory" \
); \
} \
\
static void vc1_put_shift ## NAME ## _sse2(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) \
{ \
src -= offset; \
rnd = 32-rnd; \
asm volatile( \
LOAD_ROUNDER_SSE2 \
"movq "A2", %%xmm5 \n\t" \
"movq "A1", %%xmm6 \n\t" \
"punpcklbw %%xmm0, %%xmm5 \n\t"\
"punpcklbw %%xmm0, %%xmm6 \n\t"\
ASMALIGN(3) \
"1: \n\t" \
"movq "A4", %%xmm1 \n\t" \
"movq "A3", %%xmm2 \n\t" \
"punpcklbw %%xmm0, %%xmm1 \n\t" \
"punpcklbw %%xmm0, %%xmm2 \n\t" \
"add %3, %1 \n\t" \
"movdqa %%xmm5, %%xmm3\n\t"\
"movdqa %%xmm6, %%xmm4\n\t"\
"movq "A2", %%xmm5 \n\t" \
"movq "A1", %%xmm6 \n\t" \
"punpcklbw %%xmm0, %%xmm5 \n\t" \
"punpcklbw %%xmm0, %%xmm6 \n\t" \
"pmullw %8, %%xmm2 \n\t" /* *53 */ \
"psllw $2, %%xmm1 \n\t" /* *4 */ \
"pmullw %9, %%xmm3 \n\t" /* *18 */ \
"psubsw %%xmm4, %%xmm2 \n\t" /* 53,-1 */ \
"paddsw %%xmm4, %%xmm1 \n\t" /* 4,1 */ \
"psubsw %%xmm4, %%xmm3 \n\t" /* 18,-1 */ \
"psubsw %%xmm1, %%xmm2 \n\t" /* -4,53,-2 */ \
"paddsw %%xmm2, %%xmm3 \n\t" /* -4,53,18,-3 */ \
NORMALIZE_SSE2("6", "%%xmm3") \
"add %4, %2 \n\t" \
"dec %0 \n\t" \
"jnz 1b \n\t" \
: "+g"(h), "+r" (src), "+r" (dst) \
: "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd), "m"(*ff_fact_53), "m"(*ff_fact_18) \
: "memory" \
); \
}
#else
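/* On x86_64 only the SSE2 version is generated; the _mmx name is kept as an
 * empty stub, presumably so that vc1_mspel_mc() below still compiles. */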
#define MSPEL_FILTER13(NAME, A1, A2, A3, A4) \
static void vc1_put_shift ## NAME ## _mmx(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) {}\
static void vc1_put_shift ## NAME ## _sse2(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset) \
{ \
src -= offset; \
rnd = 32-rnd; \
asm volatile( \
LOAD_ROUNDER_SSE2 \
"movq "A4", %%xmm11 \n\t" \
"movq "A3", %%xmm12 \n\t" \
"movq "A2", %%xmm5 \n\t" \
"movq "A1", %%xmm6 \n\t" \
"punpcklbw %%xmm0, %%xmm11 \n\t" \
"punpcklbw %%xmm0, %%xmm12 \n\t" \
"punpcklbw %%xmm0, %%xmm5 \n\t"\
"punpcklbw %%xmm0, %%xmm6 \n\t"\
ASMALIGN(3) \
"1: \n\t" \
"add %3, %1 \n\t" \
"movq "A4", %%xmm1 \n\t" \
"movq "A3", %%xmm2 \n\t" \
"movq "A2", %%xmm3 \n\t" \
"movq "A1", %%xmm4 \n\t" \
"punpcklbw %%xmm0, %%xmm1 \n\t" \
"punpcklbw %%xmm0, %%xmm2 \n\t" \
"punpcklbw %%xmm0, %%xmm3 \n\t" \
"punpcklbw %%xmm0, %%xmm4 \n\t" \
"pmullw %%xmm8, %%xmm12 \n\t" /* *53 */ \
"psllw $2, %%xmm11 \n\t" /* *4 */ \
"pmullw %%xmm9, %%xmm5 \n\t" /* *18 */ \
"psubsw %%xmm6, %%xmm12 \n\t" /* 53,-1 */ \
"paddsw %%xmm6, %%xmm11 \n\t" /* 4,1 */ \
"psubsw %%xmm6, %%xmm5 \n\t" /* 18,-1 */ \
"psubsw %%xmm11, %%xmm12 \n\t" /* -4,53,-2 */ \
"paddsw %%xmm12, %%xmm5 \n\t" /* -4,53,18,-3 */ \
NORMALIZE_SSE2("6", "%%xmm5") \
"add %4, %2 \n\t" \
"dec %0 \n\t" \
"jz 1f \n\t" \
"add %3, %1 \n\t" \
"movq "A4", %%xmm11 \n\t" \
"movq "A3", %%xmm12 \n\t" \
"movq "A2", %%xmm5 \n\t" \
"movq "A1", %%xmm6 \n\t" \
"punpcklbw %%xmm0, %%xmm11 \n\t" \
"punpcklbw %%xmm0, %%xmm12 \n\t" \
"punpcklbw %%xmm0, %%xmm5 \n\t" \
"punpcklbw %%xmm0, %%xmm6 \n\t" \
"pmullw %%xmm8, %%xmm2 \n\t" /* *53 */ \
"psllw $2, %%xmm1 \n\t" /* *4 */ \
"pmullw %%xmm9, %%xmm3 \n\t" /* *18 */ \
"psubsw %%xmm4, %%xmm2 \n\t" /* 53,-1 */ \
"paddsw %%xmm4, %%xmm1 \n\t" /* 4,1 */ \
"psubsw %%xmm4, %%xmm3 \n\t" /* 18,-1 */ \
"psubsw %%xmm1, %%xmm2 \n\t" /* -4,53,-2 */ \
"paddsw %%xmm2, %%xmm3 \n\t" /* -4,53,18,-3 */ \
NORMALIZE_SSE2("6", "%%xmm3") \
"add %4, %2 \n\t" \
"dec %0 \n\t" \
"jnz 1b\n\t"\
"1:\n\t"\
"nop\n\t"\
: "+g"(h), "+r" (src), "+r" (dst) \
: "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd) \
: "memory" \
); \
}
#endif
/** 1/4 shift MMX and SSE2 */
MSPEL_FILTER13(1, "(%1,%6 )", "(%1,%5,2)", "(%1,%5 )", "(%1 )")
/** 3/4 shift MMX and SSE2 */
MSPEL_FILTER13(3, "(%1 )", "(%1,%5 )", "(%1,%5,2)", "(%1,%6 )")
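/*
 * The two instantiations above build vc1_put_shift1/3_{mmx,sse2}.  Per
 * output byte they compute the VC-1 bicubic 1/4- and 3/4-pel values,
 * i.e. taps (-4, 53, 18, -3)/64 resp. (-3, 18, 53, -4)/64.  A scalar
 * sketch of the 1/4-shift case (kept disabled; av_clip_uint8() assumed
 * available):
 */
#if 0
static void vc1_put_shift1_c(uint8_t *dst, int dstr, const uint8_t *src,
                             int sstr, int h, int rnd, int offset)
{
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < 8; i++)
            dst[i] = av_clip_uint8((-  4*src[i -   offset]
                                    + 53*src[i           ]
                                    + 18*src[i +   offset]
                                    -  3*src[i + 2*offset]
                                    + 32 - rnd) >> 6);
        src += sstr;
        dst += dstr;
    }
}
#endif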
/** 1/2 shift for SSE2 instruction set */
static void vc1_put_shift2_sse2(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset)
{
src -= offset;
rnd = 8-rnd;
asm volatile(
LOAD_ROUNDER_SSE2
#ifdef ARCH_X86_64
"movq (%1 ), %%xmm11 \n\t"
"movq (%1,%5 ), %%xmm12 \n\t"
#endif
"movq (%1,%5,2), %%xmm5 \n\t"
"movq (%1,%6 ), %%xmm6 \n\t"
#ifdef ARCH_X86_64
"punpcklbw %%xmm0, %%xmm11 \n\t"
"punpcklbw %%xmm0, %%xmm12 \n\t"
#endif
"punpcklbw %%xmm0, %%xmm5 \n\t"
"punpcklbw %%xmm0, %%xmm6 \n\t"
ASMALIGN(3)
"1: \n\t"
#ifndef ARCH_X86_64
"movq (%1 ), %%xmm1 \n\t"
"movq (%1,%5 ), %%xmm2 \n\t"
"punpcklbw %%xmm0, %%xmm1 \n\t"
"punpcklbw %%xmm0, %%xmm2 \n\t"
#endif
"add %3, %1 \n\t"
#ifdef ARCH_X86_64
"movq (%1 ), %%xmm1 \n\t"
"movq (%1,%5 ), %%xmm2 \n\t"
#endif
"movq (%1,%5,2), %%xmm3 \n\t"
"movq (%1,%6 ), %%xmm4 \n\t"
#ifdef ARCH_X86_64
"punpcklbw %%xmm0, %%xmm1 \n\t"
"punpcklbw %%xmm0, %%xmm2 \n\t"
#endif
"punpcklbw %%xmm0, %%xmm3 \n\t"
"punpcklbw %%xmm0, %%xmm4 \n\t"
#ifndef ARCH_X86_64
"paddsw %%xmm2, %%xmm5 \n\t"
"paddsw %%xmm1, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm2 \n\t"
"psllw $3, %%xmm5 \n\t" /* 8* */
"paddw %%xmm2, %%xmm5 \n\t" /* 9,9 */
"psubsw %%xmm6, %%xmm5 \n\t"
#else
"paddsw %%xmm12, %%xmm5 \n\t"
"paddsw %%xmm11, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm12 \n\t"
"psllw $3, %%xmm5 \n\t" /* 8* */
"paddw %%xmm12, %%xmm5 \n\t" /* 9,9 */
"psubsw %%xmm6, %%xmm5 \n\t"
#endif
NORMALIZE_SSE2("4", "%%xmm5")
"add %4, %2 \n\t"
"dec %0 \n\t"
"jz 1f \n\t"
#ifndef ARCH_X86_64
"movq (%1 ), %%xmm1 \n\t"
"movq (%1,%5 ), %%xmm2 \n\t"
"punpcklbw %%xmm0, %%xmm1 \n\t"
"punpcklbw %%xmm0, %%xmm2 \n\t"
#endif
"add %3, %1 \n\t"
#ifdef ARCH_X86_64
"movq (%1 ), %%xmm11 \n\t"
"movq (%1,%5 ), %%xmm12 \n\t"
#endif
"movq (%1,%5,2), %%xmm5 \n\t"
"movq (%1,%6 ), %%xmm6 \n\t"
#ifdef ARCH_X86_64
"punpcklbw %%xmm0, %%xmm11 \n\t"
"punpcklbw %%xmm0, %%xmm12 \n\t"
#endif
"punpcklbw %%xmm0, %%xmm5 \n\t"
"punpcklbw %%xmm0, %%xmm6 \n\t"
"paddsw %%xmm2, %%xmm3 \n\t"
"paddsw %%xmm1, %%xmm4 \n\t"
"movdqa %%xmm3, %%xmm2 \n\t"
"psllw $3, %%xmm3 \n\t" /* 8* */
"paddw %%xmm2, %%xmm3 \n\t" /* 9,9 */
"psubsw %%xmm4, %%xmm3 \n\t"
NORMALIZE_SSE2("4", "%%xmm3")
"add %4, %2 \n\t"
"dec %0 \n\t"
"jnz 1b \n\t"
"1:\n\t"
"nop\n\t"
: "+g"(h), "+r" (src), "+r" (dst)
: "g"((intptr_t)sstr), "g"((intptr_t)dstr), "r"((intptr_t)offset), "r"((intptr_t)(3*offset)), "g"(rnd)
: "memory"
);
}
extern void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h);
typedef void (*vc1_mspel_mc_filter)(uint8_t *dst, int dstr, const uint8_t *src, int sstr, int h, int rnd, int offset);
/** Interpolate fractional pel values using MMX or SSE2 */
static inline void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, int mode, int rnd, const int sse2)
{
const uint8_t *tptr;
int tptrstr;
int mode1 = mode & 3;
int mode2 = (mode >> 2) & 3;
DECLARE_ALIGNED_16(uint8_t, tmp[8*11]);
vc1_mspel_mc_filter vc1_put_shift[4];
if (sse2) {
vc1_put_shift[1] = vc1_put_shift1_sse2;
vc1_put_shift[2] = vc1_put_shift2_sse2;
vc1_put_shift[3] = vc1_put_shift3_sse2;
asm volatile(
"pxor %%xmm0, %%xmm0 \n\t"
#ifdef ARCH_X86_64
"movdqa %0, %%xmm8 \n\t"
"movdqa %1, %%xmm9 \n\t"
#endif
:: "m"(*ff_fact_53), "m"(*ff_fact_18)
);
} else {
vc1_put_shift[1] = vc1_put_shift1_mmx;
vc1_put_shift[2] = vc1_put_shift2_mmx;
vc1_put_shift[3] = vc1_put_shift3_mmx;
asm volatile(
"pxor %%mm0, %%mm0 \n\t"
"movq %0, %%mm5 \n\t"
"movq %1, %%mm6 \n\t"
:: "m"(*ff_fact_53), "m"(*ff_fact_18)
);
}
/* The 8x11 temporary buffer holds the horizontally filtered rows:
 * tmp = src-stride, tmp+8 = src, ..., so the vertical pass can reach one
 * row above and two rows below the 8 output rows. */
if (mode1) { /* Horizontal filter to apply */
if (mode2) { /* Vertical filter to apply, output to tmp */
vc1_put_shift[mode1](tmp, 8, src-stride, stride, 11, rnd, 1);
tptr = tmp+8;
tptrstr = 8;
} else { /* No vertical filter, output 8 lines to dst */
vc1_put_shift[mode1](dst, stride, src, stride, 8, rnd, 1);
return;
}
} else {
/* No horizontal filter, use directly src as input */
tptr = src;
tptrstr = stride;
/* put_vc1_mspel_mc00_mmx directly calls put_pixels8_mmx */
}
vc1_put_shift[mode2](dst, stride, tptr, tptrstr, 8, 1-rnd, tptrstr);
}
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
put_pixels8_mmx(dst, src, stride, 8);
}
#define DECLARE_FUNCTIONS(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
vc1_mspel_mc(dst, src, stride, a + (b<<2), rnd, 0); \
} \
static void put_vc1_mspel_mc ## a ## b ## _sse2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
vc1_mspel_mc(dst, src, stride, a + (b<<2), rnd, 1); \
}
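/* put_vc1_mspel_mc ## a ## b applies a horizontal shift of a/4 pel and a
 * vertical shift of b/4 pel; the tables below are indexed by a + 4*b. */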
DECLARE_FUNCTIONS(0, 1)
DECLARE_FUNCTIONS(0, 2)
DECLARE_FUNCTIONS(0, 3)
DECLARE_FUNCTIONS(1, 0)
DECLARE_FUNCTIONS(1, 1)
DECLARE_FUNCTIONS(1, 2)
DECLARE_FUNCTIONS(1, 3)
DECLARE_FUNCTIONS(2, 0)
DECLARE_FUNCTIONS(2, 1)
DECLARE_FUNCTIONS(2, 2)
DECLARE_FUNCTIONS(2, 3)
DECLARE_FUNCTIONS(3, 0)
DECLARE_FUNCTIONS(3, 1)
DECLARE_FUNCTIONS(3, 2)
DECLARE_FUNCTIONS(3, 3)
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}
void ff_vc1dsp_init_sse2(DSPContext* dsp, AVCodecContext *avctx) {
dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_sse2;
dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_sse2;
dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_sse2;
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_sse2;
dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_sse2;
dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_sse2;
dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_sse2;
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_sse2;
dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_sse2;
dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_sse2;
dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_sse2;
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_sse2;
dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_sse2;
dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_sse2;
dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_sse2;
}
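/* Intended hook-up, as a sketch only: the cpu-flag names below (MM_MMX,
 * MM_SSE2 from dsputil.h) and the call site in the x86 dsputil init are
 * assumptions, the actual integration lives outside this file.
 *
 *     if (mm_flags & MM_MMX)  ff_vc1dsp_init_mmx (c, avctx);
 *     if (mm_flags & MM_SSE2) ff_vc1dsp_init_sse2(c, avctx);
 */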