[FFmpeg-devel] [RFC] DXVA2 decoding and FFmpeg

Timothy Gu timothygu99 at gmail.com
Fri May 29 18:47:58 CEST 2015


On Fri, May 29, 2015 at 03:49:22PM +0200, Stefano Sabatini wrote:
> @@ -405,3 +406,16 @@ int av_image_copy_to_buffer(uint8_t *dst, int dst_size,
>  
>      return size;
>  }
> +
> +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize,
> +				   const uint8_t *src, size_t src_linesize,
> +				   unsigned bytewidth, unsigned height,
> +				   int cpu_flags)
> +{
> +#if !HAVE_SSSE3

> +    av_unused(cpu_flags);

av_unused has a different definition than VLC_UNUSED: it is a variable
attribute, not a function-like macro. Just use a (void) cast, e.g.
"(void)cpu_flags;".

> +    av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
> +#else
> +    ff_image_copy_plane_from_uswc_x86(dst, dst_linesize, src, src_linesize, bytewidth, height, cpu_flags);
> +#endif
> +}
> diff --git a/libavutil/imgutils.h b/libavutil/imgutils.h
> index 23282a3..184e1e7 100644
> --- a/libavutil/imgutils.h
> +++ b/libavutil/imgutils.h
> @@ -111,6 +111,24 @@ void av_image_copy_plane(uint8_t       *dst, int dst_linesize,
>                           int bytewidth, int height);
>  
>  /**
> + * Copy image plane from src to dst, similar to av_image_copy_plane().
> + * src must be an USWC buffer.
> + * It performs optimized copy from "Uncacheable Speculative Write
> + * Combining" memory as used by some video surface.
> + * It is really efficient only when SSE4.1 is available.
> + *
> + * In case the target CPU does not support USWC caching this function
> + * will be equivalent to av_image_copy_plane().
> + *
> + * @param cpu_flags as returned by av_get_cpu_flags()
> + * @see av_image_copy_plane()
> + */
> +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize,
> +                                   const uint8_t *src, size_t src_linesize,
> +                                   unsigned bytewidth, unsigned height,
> +                                   int cpu_flags);
> +
> +/**
>   * Copy image in src_data to dst_data.
>   *
>   * @param dst_linesizes linesizes for the image in dst_data
> diff --git a/libavutil/imgutils_internal.h b/libavutil/imgutils_internal.h
> new file mode 100644
> index 0000000..9576afe
> --- /dev/null
> +++ b/libavutil/imgutils_internal.h
> @@ -0,0 +1,29 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_IMGUTILS_INTERNAL_H
> +#define AVUTIL_IMGUTILS_INTERNAL_H
> +
> +#include "imgutils.h"
> +
> +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize,
> +				       const uint8_t *src, size_t src_linesize,
> +				       unsigned bytewidth, unsigned height,
> +				       int cpu_flags);
> +
> +#endif /* AVUTIL_IMGUTILS_INTERNAL_H */
> diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
> index eb70a62..a719c00 100644
> --- a/libavutil/x86/Makefile
> +++ b/libavutil/x86/Makefile
> @@ -1,5 +1,6 @@
>  OBJS += x86/cpu.o                                                       \
>          x86/float_dsp_init.o                                            \
> +        x86/imgutils.o                                                  \
>          x86/lls_init.o                                                  \
>  
>  OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o                      \
> diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c
> new file mode 100644
> index 0000000..8b3ed0f
> --- /dev/null
> +++ b/libavutil/x86/imgutils.c
> @@ -0,0 +1,95 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <inttypes.h>
> +#include "config.h"
> +#include "libavutil/avassert.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/imgutils_internal.h"
> +
> +#if HAVE_SSE2
> +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
> + * load and storing data with the SSE>=2 instruction store.
> + */
> +#define COPY16(dstp, srcp, load, store) \
> +    __asm__ volatile (                  \
> +        load "  0(%[src]), %%xmm1\n"    \
> +        store " %%xmm1,    0(%[dst])\n" \
> +        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
> +
> +#define COPY64(dstp, srcp, load, store) \
> +    __asm__ volatile (                  \
> +        load "  0(%[src]), %%xmm1\n"    \
> +        load " 16(%[src]), %%xmm2\n"    \
> +        load " 32(%[src]), %%xmm3\n"    \
> +        load " 48(%[src]), %%xmm4\n"    \
> +        store " %%xmm1,    0(%[dst])\n" \
> +        store " %%xmm2,   16(%[dst])\n" \
> +        store " %%xmm3,   32(%[dst])\n" \
> +        store " %%xmm4,   48(%[dst])\n" \
> +        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
> +#endif
> +
> +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize,
> +				       const uint8_t *src, size_t src_linesize,
> +				       unsigned bytewidth, unsigned height,
> +				       int cpu_flags)
> +{
> +#if !HAVE_SSSE3

Are any SSSE3 instructions used?

> +    return av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
> +#endif
> +
> +    av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0);
> +
> +    __asm__ volatile ("mfence");
> +
> +    for (unsigned y = 0; y < height; y++) {
> +        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
> +        unsigned x = unaligned;
> +

> +#if HAVE_SSE42
> +        if (cpu_flags & AV_CPU_FLAG_SSE4) {

movntdqa is an SSE4.1 instruction, not an SSE4.2 one, so guarding it with
HAVE_SSE42 is stricter than necessary. This should work better:

    if (INLINE_SSE4(cpu_flags))

That checks both HAVE_SSE4_INLINE and cpu_flags for AV_CPU_FLAG_SSE4.

(But then, as others have said, new inline asm code shouldn't be added in the
first place.)

Timothy


More information about the ffmpeg-devel mailing list