[FFmpeg-devel] [PATCH V1 1/3] lavu: Add alpha blending API based on row.
Rostislav Pehlivanov
atomnuker at gmail.com
Tue Sep 25 22:49:10 EEST 2018
On 25 September 2018 at 16:27, Jun Zhao <mypopydev at gmail.com> wrote:
> Add a row-based alpha blending API supporting both global and per-pixel
> alpha blending, and add SSSE3/AVX2 optimizations of the functions.
>
> Signed-off-by: Jun Zhao <mypopydev at gmail.com>
> ---
> libavutil/Makefile | 2 +
> libavutil/blend.c | 101 ++++++++++++
> libavutil/blend.h | 47 ++++++
> libavutil/x86/Makefile | 3 +-
> libavutil/x86/blend.h | 32 ++++
> libavutil/x86/blend_init.c | 369 ++++++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 553 insertions(+), 1 deletions(-)
> create mode 100644 libavutil/blend.c
> create mode 100644 libavutil/blend.h
> create mode 100644 libavutil/x86/blend.h
> create mode 100644 libavutil/x86/blend_init.c
>
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 9ed24cf..f1c06e4 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -10,6 +10,7 @@ HEADERS = adler32.h \
> avstring.h \
> avutil.h \
> base64.h \
> + blend.h \
> blowfish.h \
> bprint.h \
> bswap.h \
> @@ -95,6 +96,7 @@ OBJS = adler32.o \
> audio_fifo.o \
> avstring.o \
> base64.o \
> + blend.o \
> blowfish.o \
> bprint.o \
> buffer.o \
> diff --git a/libavutil/blend.c b/libavutil/blend.c
> new file mode 100644
> index 0000000..e28efa0
> --- /dev/null
> +++ b/libavutil/blend.c
> @@ -0,0 +1,101 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/blend.h"
> +
> +#include "libavutil/x86/blend.h"
> +
> +static void ff_global_blend_row_c(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha, /* XXX: only use alpha[0] */
> + uint8_t *dst,
> + int width)
> +{
> + int x;
> + for (x = 0; x < width - 1; x += 2) {
> + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
> + dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255) >> 8;
> + src0 += 2;
> + src1 += 2;
> + dst += 2;
> + }
> + if (width & 1) {
> + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
> + }
> +}
> +
> +void av_global_blend_row(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + blend_row blend_row_fn = NULL;
> +
> +#if ARCH_X86
> + blend_row_fn = ff_blend_row_init_x86(1);
> +#endif
> +
> + if (!blend_row_fn)
> + blend_row_fn = ff_global_blend_row_c;
> +
> + blend_row_fn(src0, src1, alpha, dst, width);
> +}
> +
> +static void ff_per_pixel_blend_row_c(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + int x;
> + for (x = 0; x < width - 1; x += 2) {
> + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
> + dst[1] = (src0[1] * alpha[1] + src1[1] * (255 - alpha[1]) + 255) >> 8;
> + src0 += 2;
> + src1 += 2;
> + dst += 2;
> + alpha += 2;
> + }
> + if (width & 1) {
> + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
> + }
> +}
> +
> +void av_per_pixel_blend_row(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + blend_row blend_row_fn = NULL;
> +
> +#if ARCH_X86
> + blend_row_fn = ff_blend_row_init_x86(0);
> +#endif
> +
> + if (!blend_row_fn)
> + blend_row_fn = ff_per_pixel_blend_row_c;
> +
> + blend_row_fn(src0, src1, alpha, dst, width);
> +}
> +
> diff --git a/libavutil/blend.h b/libavutil/blend.h
> new file mode 100644
> index 0000000..8a42109
> --- /dev/null
> +++ b/libavutil/blend.h
> @@ -0,0 +1,47 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +#ifndef AVUTIL_BLEND_H
> +#define AVUTIL_BLEND_H
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +
> +/**
> + * Global alpha blending by row
> + *
> + * dst[i] = (src0[i]*alpha[0]+(255-alpha[0])*src1[i]+255)>>8
> + */
> +void av_global_blend_row(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha, /* XXX: only use alpha[0] */
> + uint8_t *dst,
> + int width);
> +
> +/**
> + * Per-pixel alpha blending by row
> + *
> + * dst[i] = (src0[i]*alpha[i]+(255-alpha[i])*src1[i]+255)>>8
> + */
> +void av_per_pixel_blend_row(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width);
> +#endif
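
For illustration only, not part of the patch: assuming the patch applies, a
caller would use the two entry points declared above roughly like this (the
buffer names and values here are made up).

    #include <stdint.h>
    #include "libavutil/blend.h"   /* header added by this patch */

    static void blend_row_demo(void)
    {
        uint8_t fg[16], bg[16], alpha[16], out[16];
        int i;

        for (i = 0; i < 16; i++) {
            fg[i]    = 200;
            bg[i]    = 100;
            alpha[i] = i * 17;     /* coverage ramp 0..255 across the row */
        }

        /* Constant alpha: only alpha[0] is read, per the XXX note above. */
        av_global_blend_row(fg, bg, alpha, out, 16);

        /* Per-pixel alpha: pixel i is weighted by alpha[i].  Because of the
         * +255 bias, alpha 255 yields exactly fg and alpha 0 exactly bg. */
        av_per_pixel_blend_row(fg, bg, alpha, out, 16);
    }
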
> diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
> index 5f5242b..1e5e3e4 100644
> --- a/libavutil/x86/Makefile
> +++ b/libavutil/x86/Makefile
> @@ -1,4 +1,5 @@
> -OBJS += x86/cpu.o \
> +OBJS += x86/blend_init.o \
> + x86/cpu.o \
> x86/fixed_dsp_init.o \
> x86/float_dsp_init.o \
> x86/imgutils_init.o \
> diff --git a/libavutil/x86/blend.h b/libavutil/x86/blend.h
> new file mode 100644
> index 0000000..9fa0f36
> --- /dev/null
> +++ b/libavutil/x86/blend.h
> @@ -0,0 +1,32 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_X86_BLEND_H
> +#define AVUTIL_X86_BLEND_H
> +
> +#include "libavutil/blend.h"
> +
> +typedef void (*blend_row)(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width);
> +
> +blend_row ff_blend_row_init_x86(int global);
> +
> +#endif /* AVUTIL_X86_BLEND_H */
> diff --git a/libavutil/x86/blend_init.c b/libavutil/x86/blend_init.c
> new file mode 100644
> index 0000000..f555dfa
> --- /dev/null
> +++ b/libavutil/x86/blend_init.c
> @@ -0,0 +1,369 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/blend.h"
> +
> +#if HAVE_SSSE3_INLINE && HAVE_6REGS
> +// per-pixel blend (8 pixels at a time.)
> +// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
> +static void ff_per_pixel_blend_row_ssse3(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + int aligned_w = width/8 * 8;
> + int width_u = width - aligned_w;
> + uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
> + uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
> + uint8_t *alpha_u = (uint8_t *)alpha + aligned_w;
> + uint8_t *dst_u = dst + aligned_w;
> + int i;
> +
> + if (aligned_w > 0) {
> + __asm__ volatile(
> + "pcmpeqb %%xmm3,%%xmm3 \n\t"
> + "psllw $0x8,%%xmm3 \n\t"
> + "mov $0x80808080,%%eax \n\t"
> + "movd %%eax,%%xmm3 \n\t"
> + "pshufd $0x0,%%xmm4,%%xmm4 \n\t"
> + "mov $0x807f807f,%%eax \n\t"
> + "movd %%eax,%%xmm5 \n\t"
> + "pshufd $0x0,%%xmm5,%%xmm5 \n\t"
> + "sub %2,%0 \n\t"
> + "sub %2,%1 \n\t"
> + "sub %2,%3 \n\t"
> +
> + // 8 pixel per loop.
> + "1: \n\t"
> + "movq (%2),%%xmm0 \n\t"
> + "punpcklbw %%xmm0,%%xmm0 \n\t"
> + "pxor %%xmm3,%%xmm0 \n\t"
> + "movq (%0,%2,1),%%xmm1 \n\t"
> + "movq (%1,%2,1),%%xmm2 \n\t"
> + "punpcklbw %%xmm2,%%xmm1 \n\t"
> + "psubb %%xmm4,%%xmm1 \n\t"
> + "pmaddubsw %%xmm1,%%xmm0 \n\t"
> + "paddw %%xmm5,%%xmm0 \n\t"
> + "psrlw $0x8,%%xmm0 \n\t"
> + "packuswb %%xmm0,%%xmm0 \n\t"
> + "movq %%xmm0,(%3,%2,1) \n\t"
> + "lea 0x8(%2),%2 \n\t"
> + "sub $0x8,%4 \n\t"
> + "jg 1b \n\t"
> + : "+r"(src0), // %0
> + "+r"(src1), // %1
> + "+r"(alpha), // %2
> + "+r"(dst), // %3
> + "+rm"(aligned_w) // %4
> + ::"memory",
> + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
> + }
> +
> + for (i = 0; i < width_u - 1; i += 2) {
> + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
> + dst_u[1] = (src0_u[1] * alpha_u[1] + src1_u[1] * (255 - alpha_u[1]) + 255) >> 8;
> + src0_u += 2;
> + src1_u += 2;
> + dst_u += 2;
> + alpha_u += 2;
> + }
> + if (width_u & 1) {
> + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
> + }
> +}
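
If I read the constants right, the SSSE3 math works out as follows: after the
punpcklbw and the pxor with the 0xFF00 words, each alpha word holds the
unsigned byte pair (a, 255-a); the interleaved source word holds (s0, s1)
with 128 subtracted from each byte, so pmaddubsw produces
a*(s0-128) + (255-a)*(s1-128) = a*s0 + (255-a)*s1 - 32640. Adding 0x807f
(that is 32640 + 255, with wraparound) leaves a*s0 + (255-a)*s1 + 255, and
the psrlw by 8 is the >>8 from the C reference, so the SIMD and C paths agree.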
> +
> +// global blend (8 pixels at a time).
> +// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
> +static void ff_global_blend_row_ssse3(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + int aligned_w = width/8 * 8;
> + int width_u = width - aligned_w;
> + uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
> + uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
> + uint8_t *dst_u = dst + aligned_w;
> + int i;
> +
> + if (aligned_w > 0) {
> + __asm__ volatile(
> + "pcmpeqb %%xmm3,%%xmm3 \n\t"
> + "psllw $0x8,%%xmm3 \n\t"
> + "mov $0x80808080,%%eax \n\t"
> + "movd %%eax,%%xmm4 \n\t"
> + "pshufd $0x0,%%xmm4,%%xmm4 \n\t"
> + "mov $0x807f807f,%%eax \n\t"
> + "movd %%eax,%%xmm5 \n\t"
> + "pshufd $0x0,%%xmm5,%%xmm5 \n\t"
> + // a => xmm6 [a a a a a a a a a a a a a a a a ]
> + "movb (%2),%%al \n\t"
> + "movd %%eax,%%xmm6 \n\t" // xmm6 = x
> x x x x x x x x x x x x x x a
> + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x
> x x x x x x x x x x x x x a a
> + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x
> x x x x x x x x x x x a a a a
> + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x
> x x x x x x x a a a a a a a a
> + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = a
> a a a a a a a a a a a a a a a
> +
> + // 8 pixel per loop.
> + "1: \n\t"
> + "movdqu %%xmm6,%%xmm0 \n\t" // xmm0 =
> xmm6
> + "pxor %%xmm3,%%xmm0 \n\t"
> +
> + "movq (%0),%%xmm1 \n\t"
> + "movq (%1),%%xmm2 \n\t"
> + "punpcklbw %%xmm2,%%xmm1 \n\t"
> + "psubb %%xmm4,%%xmm1 \n\t"
> +
> + "pmaddubsw %%xmm1,%%xmm0 \n\t"
> + "paddw %%xmm5,%%xmm0 \n\t"
> + "psrlw $0x8,%%xmm0 \n\t"
> + "packuswb %%xmm0,%%xmm0 \n\t"
> + "movq %%xmm0,(%3) \n\t"
> +
> + "lea 0x8(%0),%0 \n\t" // src0+8
> + "lea 0x8(%1),%1 \n\t" // src1+8
> + "lea 0x8(%3),%3 \n\t" // dst+8
> + "sub $0x8,%4 \n\t"
> + "jg 1b \n\t"
> + : "+r"(src0), // %0
> + "+r"(src1), // %1
> + "+r"(alpha), // %2
> + "+r"(dst), // %3
> + "+rm"(aligned_w) // %4
> + ::"memory",
> + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
> "xmm6");
> + }
> +
> + for (i = 0; i < width_u - 1; i += 2) {
> + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
> + dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) + 255) >> 8;
> + src0_u += 2;
> + src1_u += 2;
> + dst_u += 2;
> + }
> + if (width_u & 1) {
> + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
> + }
> +}
> +#endif
> +
> +#if HAVE_AVX2_INLINE && HAVE_6REGS
> +// per-pixel blend (32 pixels at a time).
> +// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
> +static void ff_per_pixel_blend_row_avx2(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + int aligned_w = width/32 * 32;
> + int width_u = width - aligned_w;
> + uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
> + uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
> + uint8_t *alpha_u = (uint8_t *)alpha + aligned_w;
> + uint8_t *dst_u = dst + aligned_w;
> + int i;
> +
> + if (aligned_w > 0) {
> + __asm__ volatile(
> + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n\t"
> + "vpsllw $0x8,%%ymm5,%%ymm5 \n\t"
> + "mov $0x80808080,%%eax \n\t"
> + "vmovd %%eax,%%xmm6 \n\t"
> + "vbroadcastss %%xmm6,%%ymm6 \n\t"
> + "mov $0x807f807f,%%eax \n\t"
> + "vmovd %%eax,%%xmm7 \n\t"
> + "vbroadcastss %%xmm7,%%ymm7 \n\t"
> + "sub %2,%0 \n\t"
> + "sub %2,%1 \n\t"
> + "sub %2,%3 \n\t"
> +
> + // 32 pixel per loop.
> + "1: \n\t"
> + "vmovdqu (%2),%%ymm0 \n\t"
> + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n\t"
> + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n\t"
> + "vpxor %%ymm5,%%ymm3,%%ymm3 \n\t"
> + "vpxor %%ymm5,%%ymm0,%%ymm0 \n\t"
> + "vmovdqu (%0,%2,1),%%ymm1 \n\t"
> + "vmovdqu (%1,%2,1),%%ymm2 \n\t"
> + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n\t"
> + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n\t"
> + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n\t"
> + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n\t"
> + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n\t"
> + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n\t"
> + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n\t"
> + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n\t"
> + "vpsrlw $0x8,%%ymm3,%%ymm3 \n\t"
> + "vpsrlw $0x8,%%ymm0,%%ymm0 \n\t"
> + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n\t"
> + "vmovdqu %%ymm0,(%3,%2,1) \n\t"
> + "lea 0x20(%2),%2 \n\t"
> + "sub $0x20,%4 \n\t"
> + "jg 1b \n\t"
> + "vzeroupper \n\t"
> + : "+r"(src0), // %0
> + "+r"(src1), // %1
> + "+r"(alpha), // %2
> + "+r"(dst), // %3
> + "+rm"(aligned_w) // %4
> + ::"memory",
> + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
> "xmm6",
> + "xmm7");
> + }
> +
> + for (i = 0; i < width_u - 1; i += 2) {
> + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
> + dst_u[1] = (src0_u[1] * alpha_u[1] + src1_u[1] * (255 - alpha_u[1]) + 255) >> 8;
> + src0_u += 2;
> + src1_u += 2;
> + dst_u += 2;
> + alpha_u += 2;
> + }
> + if (width_u & 1) {
> + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
> + }
> +}
> +
> +// global blend (32 pixels at a time)
> +// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
> +static void ff_global_blend_row_avx2(const uint8_t *src0,
> + const uint8_t *src1,
> + const uint8_t *alpha,
> + uint8_t *dst,
> + int width)
> +{
> + int aligned_w = width/32 * 32;
> + int width_u = width - aligned_w;
> + uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
> + uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
> + uint8_t *dst_u = dst + aligned_w;
> + int i;
> +
> + if (aligned_w > 0) {
> + __asm__ volatile(
> + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n\t"
> + "vpsllw $0x8,%%ymm5,%%ymm5 \n\t"
> + "mov $0x80808080,%%eax \n\t"
> + "vmovd %%eax,%%xmm6 \n\t"
> + "vbroadcastss %%xmm6,%%ymm6 \n\t"
> + "mov $0x807f807f,%%eax \n\t"
> + "vmovd %%eax,%%xmm7 \n\t"
> + "vbroadcastss %%xmm7,%%ymm7 \n\t"
> + // a => ymm8 [a a a a a a a a a a a a a a a a
> + // a a a a a a a a a a a a a a a a
> + // a a a a a a a a a a a a a a a a
> + // a a a a a a a a a a a a a a a a]
> + "movb (%2),%%al \n\t"
> + "movd %%eax,%%xmm8 \n\t" // xmm8 = x
> x x x x x x x x x x x x x x a
> + "punpcklbw %%xmm8,%%xmm8 \n\t" // xmm8 = x
> x x x x x x x x x x x x x a a
> + "punpcklbw %%xmm8,%%xmm8 \n\t" // xmm8 = x
> x x x x x x x x x x x a a a a
> + "vbroadcastss %%xmm8,%%ymm8 \n\t"
> +
> + // 32 pixel per loop.
> + "1: \n\t"
> + "vmovdqu %%ymm8,%%ymm0 \n\t"
> + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n\t"
> + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n\t"
> + "vpxor %%ymm5,%%ymm3,%%ymm3 \n\t"
> + "vpxor %%ymm5,%%ymm0,%%ymm0 \n\t"
> +
> + "vmovdqu (%0),%%ymm1 \n\t"
> + "vmovdqu (%1),%%ymm2 \n\t"
> + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n\t"
> + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n\t"
> + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n\t"
> + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n\t"
> + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n\t"
> + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n\t"
> + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n\t"
> + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n\t"
> + "vpsrlw $0x8,%%ymm3,%%ymm3 \n\t"
> + "vpsrlw $0x8,%%ymm0,%%ymm0 \n\t"
> + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n\t"
> +
> + "vmovdqu %%ymm0,(%3) \n\t"
> + "lea 0x20(%0),%0 \n\t"
> + "lea 0x20(%1),%1 \n\t"
> + "lea 0x20(%3),%3 \n\t"
> + "sub $0x20,%4 \n\t"
> + "jg 1b \n\t"
> + "vzeroupper \n\t"
> + : "+r"(src0), // %0
> + "+r"(src1), // %1
> + "+r"(alpha), // %2
> + "+r"(dst), // %3
> + "+rm"(aligned_w) // %4
> + ::"memory",
> + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
> "xmm6",
> + "xmm7", "xmm8");
> + }
> +
> + for (i = 0; i < width_u - 1; i += 2) {
> + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
> + dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) + 255) >> 8;
> + src0_u += 2;
> + src1_u += 2;
> + dst_u += 2;
> + }
> + if (width_u & 1) {
> + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
> + }
> +}
> +#endif
> +
> +av_cold blend_row ff_blend_row_init_x86(int global)
> +{
> + blend_row blend_row_fn = NULL;
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (global) {
> +#if HAVE_SSSE3_INLINE && HAVE_6REGS
> + if (EXTERNAL_SSSE3(cpu_flags)) {
> + blend_row_fn = ff_global_blend_row_ssse3;
> + }
> +#endif
> +
> +#if HAVE_AVX2_INLINE && HAVE_6REGS
> + if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> + blend_row_fn = ff_global_blend_row_avx2;
> + }
> +#endif
> + } else {
> +#if HAVE_SSSE3_INLINE && HAVE_6REGS
> + if (EXTERNAL_SSSE3(cpu_flags)) {
> + blend_row_fn = ff_per_pixel_blend_row_ssse3;
> + }
> +#endif
> +
> +#if HAVE_AVX2_INLINE && HAVE_6REGS
> + if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> + blend_row_fn = ff_per_pixel_blend_row_avx2;
> + }
> +#endif
> + }
> +
> + return blend_row_fn;
> +}
> --
> 1.7.1
>
We don't use inline asm on x86 and we don't use global contexts. Look at
how float_dsp is done.
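
As a rough sketch of that layout (all names below are placeholders, not
existing FFmpeg API): a context struct holds the function pointers, an
alloc/init function fills them in once based on av_get_cpu_flags(), and the
SIMD versions live in external assembly built via x86/Makefile, the same way
AVFloatDSPContext and ff_float_dsp_init_x86() are wired up.

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical context in the spirit of AVFloatDSPContext. */
    typedef struct AVBlendDSPContext {
        void (*global_blend_row)(const uint8_t *src0, const uint8_t *src1,
                                 const uint8_t *alpha, uint8_t *dst, int width);
        void (*per_pixel_blend_row)(const uint8_t *src0, const uint8_t *src1,
                                    const uint8_t *alpha, uint8_t *dst, int width);
    } AVBlendDSPContext;

    static void global_blend_row_c(const uint8_t *src0, const uint8_t *src1,
                                   const uint8_t *alpha, uint8_t *dst, int width)
    {
        int i;
        for (i = 0; i < width; i++)
            dst[i] = (src0[i] * alpha[0] + src1[i] * (255 - alpha[0]) + 255) >> 8;
    }

    static void per_pixel_blend_row_c(const uint8_t *src0, const uint8_t *src1,
                                      const uint8_t *alpha, uint8_t *dst, int width)
    {
        int i;
        for (i = 0; i < width; i++)
            dst[i] = (src0[i] * alpha[i] + src1[i] * (255 - alpha[i]) + 255) >> 8;
    }

    /* Allocated once by the caller; every later row blend is a plain
     * indirect call, with no per-call CPU-flag lookup. */
    AVBlendDSPContext *av_blend_dsp_alloc(void)
    {
        AVBlendDSPContext *dsp = malloc(sizeof(*dsp));
        if (!dsp)
            return NULL;
        dsp->global_blend_row    = global_blend_row_c;
        dsp->per_pixel_blend_row = per_pixel_blend_row_c;
    #if ARCH_X86
        /* Hypothetical: overrides the C pointers with versions written in
         * external assembly, selected once from av_get_cpu_flags(). */
        ff_blend_dsp_init_x86(dsp);
    #endif
        return dsp;
    }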