[FFmpeg-devel] [PATCH] faster vp6 decoding

Wed Feb 11 16:28:45 CET 2009

Hi,

2009/2/9 Jason Garrett-Glaser <darkshikari at gmail.com>:
> +    "punpcklbw %%mm7, %%mm0\n\t"                                \
> +    "punpcklbw %%mm7, %%mm1\n\t"                                \
> +    "punpckhbw %%mm7, %%mm3\n\t"                                \
> +    "punpckhbw %%mm7, %%mm4\n\t"                                \
> +    "pmullw  0(%2), %%mm0\n\t" /* src[x-8 ] * biweight [0] */   \
> +    "pmullw  8(%2), %%mm1\n\t" /* src[x   ] * biweight [1] */   \
> +    "pmullw  0(%2), %%mm3\n\t" /* src[x-8 ] * biweight [0] */   \
> +    "pmullw  8(%2), %%mm4\n\t" /* src[x   ] * biweight [1] */   \
> +    "paddw %%mm1, %%mm0\n\t"                                    \
> +    "paddw %%mm4, %%mm3\n\t"                                    \
>
> This can be done faster with pmaddubsw (SSSE3-only, but worth making
> another version surely).

Sure, but wouldn't that require the weights to be stored as arrays of
int8_t instead of int16_t?

> Worthwhile if you make an SSE version.

SSE2?

> Works by interleaving the weights, allowing you to avoid the unpacks,
> use only two multiplies, and avoid the adds, too, I think.  If I'm
> right, that makes the entire thing quite a bit less than half the
> instructions.

I tried something like the code below, and it's about 15% faster on my
Pentium M. The speedup should be more pronounced on modern CPUs with a
128-bit FADD unit:

/*
 * Asm text for one row (8 pixels) of a 4-tap filter.
 *
 * in1..in4 are the byte displacements, relative to operand %0, of the four
 * taps.  Eight bytes are loaded per tap, widened to 16-bit words and
 * multiplied by the per-tap weight; the four products are summed, the
 * rounding constant is added, the sum is shifted right by 7 and the result
 * is packed with unsigned saturation into the 8 bytes at operand %1.
 *
 * Expected register state on entry (set up by the user of the macro):
 *   xmm7       all zero (used to zero-extend bytes to words)
 *   xmm4..xmm6 weight of tap 1..3 replicated into all eight words
 *   xmm3       weight of tap 4 replicated into all eight words
 * Overwrites xmm0, xmm1 and xmm2.
 *
 * The weights are signed 16-bit values, so the accumulated sum may be
 * negative.  The final shift therefore has to be arithmetic (psraw): with a
 * logical shift a negative sum turns into a large positive word and
 * packuswb would clamp it to 255 instead of 0.  The last two additions are
 * saturating (paddsw) so that a total outside the int16 range clamps
 * instead of wrapping around before the shift.
 *
 * NOTE(review): _ff_diag4_round is referenced by its assembler-level name;
 * the leading underscore presumably matches the symbol prefix of the
 * author's platform, and as a 128-bit SSE2 memory operand the object needs
 * to be 16-byte aligned -- confirm against its definition.
 */
#define DIAG4_SSE2(in1,in2,in3,in4)                                      \
    "movq "#in1"(%0), %%xmm0\n\t"                                        \
    "movq "#in2"(%0), %%xmm1\n\t"                                        \
    "punpcklbw %%xmm7, %%xmm0\n\t"                                       \
    "punpcklbw %%xmm7, %%xmm1\n\t"                                       \
    "pmullw %%xmm4, %%xmm0\n\t"        /* tap 1 * weight[0] */           \
    "pmullw %%xmm5, %%xmm1\n\t"        /* tap 2 * weight[1] */           \
    "paddw %%xmm1, %%xmm0\n\t"                                           \
    "movq "#in3"(%0), %%xmm1\n\t"                                        \
    "movq "#in4"(%0), %%xmm2\n\t"                                        \
    "punpcklbw %%xmm7, %%xmm1\n\t"                                       \
    "punpcklbw %%xmm7, %%xmm2\n\t"                                       \
    "pmullw %%xmm6, %%xmm1\n\t"        /* tap 3 * weight[2] */           \
    "pmullw %%xmm3, %%xmm2\n\t"        /* tap 4 * weight[3] */           \
    "paddw %%xmm2, %%xmm1\n\t"                                           \
    "paddsw %%xmm1, %%xmm0\n\t"        /* saturating total */            \
    "paddsw _ff_diag4_round, %%xmm0\n\t" /* Add 64 */                    \
    "psraw $7, %%xmm0\n\t"             /* signed shift, see above */     \
    "packuswb %%xmm0, %%xmm0\n\t"                                        \
    "movq %%xmm0, (%1)\n\t"

static void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
                             const int16_t *h_weights,const int16_t *v_weights)
{
    uint8_t tmp[8*11];
    uint8_t *t = tmp;
    src -= stride;

    asm (
    "pxor %%xmm7, %%xmm7\n\t"
    "movq %4, %%xmm3\n\t"
    "pshuflw $0, %%xmm3, %%xmm4\n\t"
    "punpcklqdq %%xmm4, %%xmm4\n\t"
    "pshuflw $85, %%xmm3, %%xmm5\n\t"
    "punpcklqdq %%xmm5, %%xmm5\n\t"
    "pshuflw $170, %%xmm3, %%xmm6\n\t"
    "punpcklqdq %%xmm6, %%xmm6\n\t"
    "pshuflw $255, %%xmm3, %%xmm3\n\t"
    "punpcklqdq %%xmm3, %%xmm3\n\t"
    "1:\n\t"
    DIAG4_SSE2(-1,0,1,2)
    "addl $8, %1\n\t"
    "addl %2, %0\n\t"
    "decl %3\n\t"
    "jnz 1b\n\t"
            :
            : "r" (src), "r" (t), "g" (stride), "r" (11),
"m"(*(int64_t*)h_weights)
            : "memory"
            );

    t = tmp + 8;

    asm (
    "movq %4, %%xmm3\n\t"
    "pshuflw $0, %%xmm3, %%xmm4\n\t"
    "punpcklqdq %%xmm4, %%xmm4\n\t"
    "pshuflw $85, %%xmm3, %%xmm5\n\t"
    "punpcklqdq %%xmm5, %%xmm5\n\t"
    "pshuflw $170, %%xmm3, %%xmm6\n\t"
    "punpcklqdq %%xmm6, %%xmm6\n\t"
    "pshuflw $255, %%xmm3, %%xmm3\n\t"
    "punpcklqdq %%xmm3, %%xmm3\n\t"
    "1:\n\t"
    DIAG4_SSE2(-8,0,8,16)
    "addl $8, %0\n\t"
    "addl %2, %1\n\t"
    "decl %3\n\t"
    "jnz 1b\n\t"
            :
            : "r" (t), "r" (dst), "g" (stride), "r" (8),
"m"(*(int64_t*)v_weights)
            : "memory"
    );
}

-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6