[MPlayer-dev-eng] [PATCH] VF Overlay

Fri Aug 14 10:10:50 CEST 2009

On Sat, Aug 08, 2009 at 04:57:12PM +0200, Benjamin Zores wrote:
> +{
> +    __asm__ volatile(
> +        "pxor    %%mm7, %%mm7                         \n\t"
> +        "pcmpeqb %%mm4, %%mm4                         \n\t"
> +        "movq     (%3), %%mm5                         \n\t"
> +
> +        /* don't apply layer alpha if it's 100% opaque */
> +        "cmp      $255, %4                            \n\t"
> +        "je         0f                                \n\t"
> +
> +        /* Modify alpha from image with layer alpha */
> +        "movq      %%mm5, %%mm6                       \n\t"
> +        "punpcklbw %%mm7, %%mm5                       \n\t"
> +        "punpckhbw %%mm7, %%mm6                       \n\t"
> +        "pmullw "MANGLE(MM_global_alpha)", %%mm5      \n\t"
> +        "pmullw "MANGLE(MM_global_alpha)", %%mm6      \n\t"
> +        "psrlw        $8, %%mm5                       \n\t"
> +        "psrlw        $8, %%mm6                       \n\t"
> +        "packuswb  %%mm6, %%mm5                       \n\t"
> +
> +        "0:                                           \n\t"
> +        "movq      %%mm4, %%mm6                       \n\t"
> +        "psubb     %%mm5, %%mm6                       \n\t"
> +        "movq      %%mm6, (%1)                        \n\t"
> +
> +        /* do alpha * bytes */
> +        "movq       (%2), %%mm0                       \n\t"
> +        "movq      %%mm0, %%mm1                       \n\t"
> +        "punpcklbw %%mm7, %%mm0                       \n\t"
> +        "punpckhbw %%mm7, %%mm1                       \n\t"
> +        "movq      %%mm5, %%mm6                       \n\t"
> +        "punpcklbw %%mm7, %%mm5                       \n\t"
> +        "punpckhbw %%mm7, %%mm6                       \n\t"
> +        "pmullw    %%mm5, %%mm0                       \n\t"
> +        "pmullw    %%mm6, %%mm1                       \n\t"
> +
> +        /* approximate division by 255 */
> +        "movq "MANGLE(MM_ROUND)", %%mm6               \n\t"
> +        "paddw     %%mm6, %%mm0                       \n\t"
> +        "paddw     %%mm6, %%mm1                       \n\t"
> +        "movq      %%mm0, %%mm2                       \n\t"
> +        "movq      %%mm1, %%mm3                       \n\t"
> +        "psrlw        $8, %%mm0                       \n\t"
> +        "psrlw        $8, %%mm1                       \n\t"
> +        "paddw     %%mm2, %%mm0                       \n\t"
> +        "paddw     %%mm3, %%mm1                       \n\t"
> +        "psrlw        $8, %%mm0                       \n\t"
> +        "psrlw        $8, %%mm1                       \n\t"
> +
> +        "packuswb  %%mm1, %%mm0                       \n\t"
> +        "movq      %%mm0, (%0)                        \n\t"
> +        :  "+r" (dst_byte), "+&r" (dst_alpha)
> +        :  "r" (byte), "r" (alpha), "r" (global_alpha)

Ok, this is just wrong. dst_byte and dst_alpha are the _pointers_
and those are never modified, so they should then be input operands.
Also you'll have to add "memory" clobber because gcc can not
know you modified memory.
So I think overall the constraints should look like this:
:
: "r" (dst_byte), "r" (dst_alpha), "r" (byte), "r" (alpha), "r" (global_alpha)
: "memory"

> +            __asm__ volatile(
> +                "xor        %%"REG_c", %%"REG_c"       \n\t"
> +
> +                "1:                                    \n\t"
> +                "movq (%1, %%"REG_c"), %%mm0           \n\t"
> +                "movq           %%mm0, %%mm1           \n\t"
> +                "movq (%3, %%"REG_c"), %%mm2           \n\t"
> +                "movq           %%mm2, %%mm3           \n\t"
> +
> +                "punpcklbw      %%mm7, %%mm0           \n\t"
> +                "punpckhbw      %%mm7, %%mm1           \n\t"
> +                "punpcklbw      %%mm7, %%mm2           \n\t"
> +                "punpckhbw      %%mm7, %%mm3           \n\t"
> +                "pmullw         %%mm2, %%mm0           \n\t"
> +                "pmullw         %%mm3, %%mm1           \n\t"
> +                "paddw          %%mm5, %%mm0           \n\t"
> +                "paddw          %%mm5, %%mm1           \n\t"
> +                "movq           %%mm0, %%mm2           \n\t"
> +                "movq           %%mm1, %%mm3           \n\t"
> +                "psrlw             $8, %%mm0           \n\t"
> +                "psrlw             $8, %%mm1           \n\t"
> +                "paddw          %%mm2, %%mm0           \n\t"
> +                "paddw          %%mm3, %%mm1           \n\t"
> +                "psrlw             $8, %%mm0           \n\t"
> +                "psrlw             $8, %%mm1           \n\t"
> +
> +                "movq (%2, %%"REG_c"), %%mm2           \n\t"
> +                "packuswb       %%mm1, %%mm0           \n\t"
> +                "paddb          %%mm2, %%mm0           \n\t"
> +                "movq           %%mm0, (%0, %%"REG_c") \n\t"
> +
> +                "add               $8, %%"REG_c"       \n\t"
> +                "cmp               %4, %%"REG_c"       \n\t"
> +                "jb                1b                  \n\t"
> +
> +                : "+&r" (dst), "+&r" (src), "+&r" (overlay), "+&r" (alpha)
> +                : "m" (wr)

Can't see why you'd force wr to be in memory, "g" should be a better
constraint.
Also none of the arguments are modified, so again right constraints
should be

:
: "r" (dst), "r" (src), "r" (overlay), "r" (alpha), "g" (wr)
: "%"REG_c, "memory"

> +#if HAVE_MMX
> +    if(gCpuCaps.hasMMX) {
> +        __asm__ volatile(
> +            "pxor              %%mm7, %%mm7      \n\t"
> +            "movq "MANGLE(MM_ROUND)", %%mm5      \n\t"
> +            ::: "memory"
> +        );
> +    }
> +#endif
> +
> +    for (w = priv->mpi_stride, plane = 0; plane < 3; plane++) {
> +        if (plane == 1) {
> +            w >>= 1;
> +            slice_h >>= 1;
> +        }
> +
> +        overlay = overlay_planes[plane];
> +        alpha   = alpha_planes[plane];
> +        src     = src_mpi_planes[plane];
> +        dst     = dst_mpi_planes[plane];
> +
> +        /* Global alpha is 256 which means ignore per-pixel alpha.
> +           Do straight memcpy. */
> +        if (priv->alpha == 256)
> +            memcpy_pic(dst, overlay, w, slice_h,
> +                       dst_mpi->stride[plane], src_mpi->stride[plane]);
> +        else
> +            blend_plane(w, slice_h, dst, src, overlay, alpha,
> +                        src_mpi->stride[plane],
> +                        dst_mpi->stride[plane], overlay_stride[plane]);

Now here we have a real issue. This assumes that gcc will never use MMX,
because that could clobber our carefully set up mm7 and mm5 registers.
IMHO you'll have to set up these registers in the MMX code blocks where
they are used.
But to avoid worse performance that would also mean changing the MMX
code to move the loop into the ASM, so you can set these at the start of
the loop instead for each pixel again.
Though as a simple solution, moving that asm block into blend_plane_MMX
might be good enough, maybe adding a comment that it could be an issue
in theory.