[FFmpeg-devel] [PATCH V1 1/3] lavu: Add alpha blending API based on row.
Jun Zhao
mypopydev at gmail.com
Tue Sep 25 18:27:13 EEST 2018
Add a row-based alpha blending API that supports both global-alpha and
per-pixel alpha blending, and add SSSE3/AVX2 optimizations of these functions.
Signed-off-by: Jun Zhao <mypopydev at gmail.com>
---
libavutil/Makefile | 2 +
libavutil/blend.c | 101 ++++++++++++
libavutil/blend.h | 47 ++++++
libavutil/x86/Makefile | 3 +-
libavutil/x86/blend.h | 32 ++++
libavutil/x86/blend_init.c | 369 ++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 553 insertions(+), 1 deletions(-)
create mode 100644 libavutil/blend.c
create mode 100644 libavutil/blend.h
create mode 100644 libavutil/x86/blend.h
create mode 100644 libavutil/x86/blend_init.c
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 9ed24cf..f1c06e4 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -10,6 +10,7 @@ HEADERS = adler32.h \
avstring.h \
avutil.h \
base64.h \
+ blend.h \
blowfish.h \
bprint.h \
bswap.h \
@@ -95,6 +96,7 @@ OBJS = adler32.o \
audio_fifo.o \
avstring.o \
base64.o \
+ blend.o \
blowfish.o \
bprint.o \
buffer.o \
diff --git a/libavutil/blend.c b/libavutil/blend.c
new file mode 100644
index 0000000..e28efa0
--- /dev/null
+++ b/libavutil/blend.c
@@ -0,0 +1,101 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/blend.h"
+
+#include "libavutil/x86/blend.h"
+
+/**
+ * Portable C row blender using a single alpha value for the whole row.
+ *
+ * dst[i] = (src0[i] * alpha[0] + src1[i] * (255 - alpha[0]) + 255) >> 8
+ *
+ * @param src0  row weighted by alpha[0]
+ * @param src1  row weighted by (255 - alpha[0])
+ * @param alpha pointer to the blend factor; only alpha[0] is read
+ * @param dst   output row
+ * @param width number of bytes to blend; nothing is read or written
+ *              when width <= 0
+ */
+static void ff_global_blend_row_c(const uint8_t *src0,
+                                  const uint8_t *src1,
+                                  const uint8_t *alpha, /* XXX: only use alpha[0] */
+                                  uint8_t *dst,
+                                  int width)
+{
+    int a, inv_a, x;
+
+    /* A negative odd width used to fall through to the "width & 1" tail
+     * and write dst[0]; reject non-positive widths up front instead. */
+    if (width <= 0)
+        return;
+
+    a     = alpha[0];
+    inv_a = 255 - a;
+    for (x = 0; x < width; x++)
+        dst[x] = (src0[x] * a + src1[x] * inv_a + 255) >> 8;
+}
+
+/*
+ * Public entry point for global-alpha row blending.
+ *
+ * On x86 builds an accelerated kernel is requested from
+ * ff_blend_row_init_x86(1); when that yields NULL (or on other
+ * architectures) the C implementation is used. The lookup is repeated
+ * on every call.
+ */
+void av_global_blend_row(const uint8_t *src0,
+                         const uint8_t *src1,
+                         const uint8_t *alpha,
+                         uint8_t *dst,
+                         int width)
+{
+    blend_row accel = NULL;
+
+#if ARCH_X86
+    accel = ff_blend_row_init_x86(1);
+#endif
+
+    if (accel) {
+        accel(src0, src1, alpha, dst, width);
+        return;
+    }
+
+    ff_global_blend_row_c(src0, src1, alpha, dst, width);
+}
+
+/**
+ * Portable C row blender using one alpha value per pixel.
+ *
+ * dst[i] = (src0[i] * alpha[i] + src1[i] * (255 - alpha[i]) + 255) >> 8
+ *
+ * @param src0  row weighted by alpha[i]
+ * @param src1  row weighted by (255 - alpha[i])
+ * @param alpha per-pixel blend factors, width entries
+ * @param dst   output row
+ * @param width number of bytes to blend; nothing is read or written
+ *              when width <= 0
+ */
+static void ff_per_pixel_blend_row_c(const uint8_t *src0,
+                                     const uint8_t *src1,
+                                     const uint8_t *alpha,
+                                     uint8_t *dst,
+                                     int width)
+{
+    int x;
+
+    /* Each pixel must use its own alpha[x]; the previous two-pixel unroll
+     * reused alpha[0] for the second pixel of every pair. A plain loop
+     * also avoids writing dst[0] for negative odd widths. */
+    for (x = 0; x < width; x++) {
+        const int a = alpha[x];
+
+        dst[x] = (src0[x] * a + src1[x] * (255 - a) + 255) >> 8;
+    }
+}
+
+/*
+ * Public entry point for per-pixel alpha row blending.
+ *
+ * On x86 builds an accelerated kernel is requested from
+ * ff_blend_row_init_x86(0); when that yields NULL (or on other
+ * architectures) the C implementation is used. The lookup is repeated
+ * on every call.
+ */
+void av_per_pixel_blend_row(const uint8_t *src0,
+                            const uint8_t *src1,
+                            const uint8_t *alpha,
+                            uint8_t *dst,
+                            int width)
+{
+    blend_row accel = NULL;
+
+#if ARCH_X86
+    accel = ff_blend_row_init_x86(0);
+#endif
+
+    if (accel) {
+        accel(src0, src1, alpha, dst, width);
+        return;
+    }
+
+    ff_per_pixel_blend_row_c(src0, src1, alpha, dst, width);
+}
+
diff --git a/libavutil/blend.h b/libavutil/blend.h
new file mode 100644
index 0000000..8a42109
--- /dev/null
+++ b/libavutil/blend.h
@@ -0,0 +1,47 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef AVUTIL_BLEND_H
+#define AVUTIL_BLEND_H
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+
+/**
+ * Global alpha blending by row.
+ *
+ * Blends two rows of 8-bit samples with one alpha value for the whole row:
+ *
+ * dst[i] = (src0[i]*alpha[0] + (255-alpha[0])*src1[i] + 255) >> 8
+ *
+ * @param src0  first input row, weighted by alpha[0]
+ * @param src1  second input row, weighted by (255 - alpha[0])
+ * @param alpha pointer to the blend factor; only alpha[0] is used
+ * @param dst   output row
+ * @param width number of samples (bytes) in the row
+ */
+void av_global_blend_row(const uint8_t *src0,
+ const uint8_t *src1,
+ const uint8_t *alpha, /* XXX: only use alpha[0] */
+ uint8_t *dst,
+ int width);
+
+/**
+ * Per-pixel alpha blending by row.
+ *
+ * Blends two rows of 8-bit samples with a separate alpha value per sample:
+ *
+ * dst[i] = (src0[i]*alpha[i] + (255-alpha[i])*src1[i] + 255) >> 8
+ *
+ * @param src0  first input row, weighted by alpha[i]
+ * @param src1  second input row, weighted by (255 - alpha[i])
+ * @param alpha row of blend factors, one per sample
+ * @param dst   output row
+ * @param width number of samples (bytes) in the row
+ */
+void av_per_pixel_blend_row(const uint8_t *src0,
+ const uint8_t *src1,
+ const uint8_t *alpha,
+ uint8_t *dst,
+ int width);
+#endif
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 5f5242b..1e5e3e4 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,4 +1,5 @@
-OBJS += x86/cpu.o \
+OBJS += x86/blend_init.o \
+ x86/cpu.o \
x86/fixed_dsp_init.o \
x86/float_dsp_init.o \
x86/imgutils_init.o \
diff --git a/libavutil/x86/blend.h b/libavutil/x86/blend.h
new file mode 100644
index 0000000..9fa0f36
--- /dev/null
+++ b/libavutil/x86/blend.h
@@ -0,0 +1,32 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_BLEND_H
+#define AVUTIL_X86_BLEND_H
+
+#include "libavutil/blend.h"
+
+/**
+ * Signature shared by every row blending kernel (C, SSSE3, AVX2).
+ *
+ * src0/src1 are the two input rows, alpha the blend factor(s), dst the
+ * output row and width the number of bytes to process. Whether alpha is a
+ * single value or a full row depends on the kernel (global vs per-pixel).
+ */
+typedef void (*blend_row)(const uint8_t *src0,
+ const uint8_t *src1,
+ const uint8_t *alpha,
+ uint8_t *dst,
+ int width);
+
+/**
+ * Select an x86 SIMD row blending kernel based on the CPU flags.
+ *
+ * @param global non-zero to get a global-alpha kernel, zero to get a
+ *               per-pixel alpha kernel
+ * @return the selected kernel, or NULL when no SIMD kernel is usable so the
+ *         caller must fall back to its own implementation
+ */
+blend_row ff_blend_row_init_x86(int global);
+
+#endif /* AVUTIL_X86_BLEND_H */
diff --git a/libavutil/x86/blend_init.c b/libavutil/x86/blend_init.c
new file mode 100644
index 0000000..f555dfa
--- /dev/null
+++ b/libavutil/x86/blend_init.c
@@ -0,0 +1,369 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/blend.h"
+
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+// Per-pixel blend, 8 pixels per SIMD iteration; the remaining
+// (width % 8) pixels are handled by a scalar tail.
+// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
+//
+// SIMD scheme: alpha bytes are duplicated and XORed with 0xff00 to form
+// unsigned [a, 255-a] pairs; src0/src1 bytes are interleaved and biased
+// by -128 to become signed; pmaddubsw then yields
+// a*src0 + (255-a)*src1 - 128*255 per pixel, and adding 0x807f
+// (= 128*255 + 255) restores the bias and applies the +255 rounding.
+static void ff_per_pixel_blend_row_ssse3(const uint8_t *src0,
+                                         const uint8_t *src1,
+                                         const uint8_t *alpha,
+                                         uint8_t *dst,
+                                         int width)
+{
+    int aligned_w = width/8 * 8;
+    int width_u = width - aligned_w;
+    // Tail pointers are captured now because the asm modifies its operands.
+    const uint8_t *src0_u  = src0 + aligned_w;
+    const uint8_t *src1_u  = src1 + aligned_w;
+    const uint8_t *alpha_u = alpha + aligned_w;
+    uint8_t *dst_u = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "pcmpeqb %%xmm3,%%xmm3 \n\t"
+            "psllw $0x8,%%xmm3 \n\t"           // xmm3 = 0xff00 in every word
+            "mov $0x80808080,%%eax \n\t"
+            // The bias constant belongs in xmm4; loading it into xmm3
+            // destroyed the 0xff00 mask and left xmm4 uninitialised.
+            "movd %%eax,%%xmm4 \n\t"
+            "pshufd $0x0,%%xmm4,%%xmm4 \n\t"   // xmm4 = 0x80 in every byte
+            "mov $0x807f807f,%%eax \n\t"
+            "movd %%eax,%%xmm5 \n\t"
+            "pshufd $0x0,%%xmm5,%%xmm5 \n\t"   // xmm5 = 0x807f in every word
+            // Turn src0/src1/dst into offsets from alpha so that the single
+            // index register %2 walks all four rows.
+            "sub %2,%0 \n\t"
+            "sub %2,%1 \n\t"
+            "sub %2,%3 \n\t"
+
+            // 8 pixel per loop.
+            "1: \n\t"
+            "movq (%2),%%xmm0 \n\t"
+            "punpcklbw %%xmm0,%%xmm0 \n\t"
+            "pxor %%xmm3,%%xmm0 \n\t"
+            "movq (%0,%2,1),%%xmm1 \n\t"
+            "movq (%1,%2,1),%%xmm2 \n\t"
+            "punpcklbw %%xmm2,%%xmm1 \n\t"
+            "psubb %%xmm4,%%xmm1 \n\t"
+            "pmaddubsw %%xmm1,%%xmm0 \n\t"
+            "paddw %%xmm5,%%xmm0 \n\t"
+            "psrlw $0x8,%%xmm0 \n\t"
+            "packuswb %%xmm0,%%xmm0 \n\t"
+            "movq %%xmm0,(%3,%2,1) \n\t"
+            "lea 0x8(%2),%2 \n\t"
+            "sub $0x8,%4 \n\t"
+            "jg 1b \n\t"
+            : "+r"(src0), // %0
+              "+r"(src1), // %1
+              "+r"(alpha), // %2
+              "+r"(dst), // %3
+              "+rm"(aligned_w) // %4
+            ::"memory",
+              "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+    }
+
+    // Scalar tail. Every pixel uses its own alpha (the old two-pixel unroll
+    // reused alpha_u[0] for the second pixel), and a non-positive width_u
+    // writes nothing.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha_u[i] + src1_u[i] * (255 - alpha_u[i]) + 255) >> 8;
+}
+
+// Global blend, 8 pixels per SIMD iteration; the remaining (width % 8)
+// pixels are handled by a scalar tail. Only alpha[0] is read.
+// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
+//
+// SIMD scheme: alpha[0] is splatted to every byte and XORed with 0xff00 to
+// form unsigned [a, 255-a] pairs; src0/src1 bytes are interleaved and
+// biased by -128 to become signed; pmaddubsw then yields
+// a*src0 + (255-a)*src1 - 128*255 per pixel, and adding 0x807f
+// (= 128*255 + 255) restores the bias and applies the +255 rounding.
+static void ff_global_blend_row_ssse3(const uint8_t *src0,
+                                      const uint8_t *src1,
+                                      const uint8_t *alpha,
+                                      uint8_t *dst,
+                                      int width)
+{
+    int aligned_w = width/8 * 8;
+    int width_u = width - aligned_w;
+    // Tail pointers are captured now because the asm advances src0/src1/dst.
+    const uint8_t *src0_u = src0 + aligned_w;
+    const uint8_t *src1_u = src1 + aligned_w;
+    uint8_t *dst_u = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "pcmpeqb %%xmm3,%%xmm3 \n\t"
+            "psllw $0x8,%%xmm3 \n\t"           // xmm3 = 0xff00 in every word
+            "mov $0x80808080,%%eax \n\t"
+            "movd %%eax,%%xmm4 \n\t"
+            "pshufd $0x0,%%xmm4,%%xmm4 \n\t"   // xmm4 = 0x80 in every byte
+            "mov $0x807f807f,%%eax \n\t"
+            "movd %%eax,%%xmm5 \n\t"
+            "pshufd $0x0,%%xmm5,%%xmm5 \n\t"   // xmm5 = 0x807f in every word
+            // a => xmm6 [a a a a a a a a a a a a a a a a ]
+            "movb (%2),%%al \n\t"
+            "movd %%eax,%%xmm6 \n\t" // xmm6 = x x x x x x x x x x x x x x x a
+            "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x x x x x x x x x x x x x x a a
+            "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x x x x x x x x x x x x a a a a
+            "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x x x x x x x x a a a a a a a a
+            "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = a a a a a a a a a a a a a a a a
+
+            // 8 pixel per loop.
+            "1: \n\t"
+            "movdqu %%xmm6,%%xmm0 \n\t" // xmm0 = xmm6
+            "pxor %%xmm3,%%xmm0 \n\t"
+
+            "movq (%0),%%xmm1 \n\t"
+            "movq (%1),%%xmm2 \n\t"
+            "punpcklbw %%xmm2,%%xmm1 \n\t"
+            "psubb %%xmm4,%%xmm1 \n\t"
+
+            "pmaddubsw %%xmm1,%%xmm0 \n\t"
+            "paddw %%xmm5,%%xmm0 \n\t"
+            "psrlw $0x8,%%xmm0 \n\t"
+            "packuswb %%xmm0,%%xmm0 \n\t"
+            "movq %%xmm0,(%3) \n\t"
+
+            "lea 0x8(%0),%0 \n\t" // src0+8
+            "lea 0x8(%1),%1 \n\t" // src1+8
+            "lea 0x8(%3),%3 \n\t" // dst+8
+            "sub $0x8,%4 \n\t"
+            "jg 1b \n\t"
+            : "+r"(src0), // %0
+              "+r"(src1), // %1
+              "+r"(alpha), // %2
+              "+r"(dst), // %3
+              "+rm"(aligned_w) // %4
+            ::"memory",
+              "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+    }
+
+    // Scalar tail. A plain counted loop writes nothing when width_u is not
+    // positive; the old "width_u & 1" test wrote one byte for negative odd
+    // widths.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha[0] + src1_u[i] * (255 - alpha[0]) + 255) >> 8;
+}
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+// Per-pixel blend, 32 pixels per SIMD iteration; the remaining
+// (width % 32) pixels are handled by a scalar tail.
+// dst[i] = ((src0[i]*alpha[i])+(src1[i]*(255-alpha[i]))+255)/256
+//
+// SIMD scheme: alpha bytes are duplicated and XORed with 0xff00 to form
+// unsigned [a, 255-a] pairs; src0/src1 bytes are interleaved and biased
+// by -128 to become signed; vpmaddubsw then yields
+// a*src0 + (255-a)*src1 - 128*255 per pixel, and adding 0x807f
+// (= 128*255 + 255) restores the bias and applies the +255 rounding.
+// The unpack and pack steps both work per 128-bit lane, so pixel order is
+// preserved.
+static void ff_per_pixel_blend_row_avx2(const uint8_t *src0,
+                                        const uint8_t *src1,
+                                        const uint8_t *alpha,
+                                        uint8_t *dst,
+                                        int width)
+{
+    int aligned_w = width/32 * 32;
+    int width_u = width - aligned_w;
+    // Tail pointers are captured now because the asm modifies its operands.
+    const uint8_t *src0_u  = src0 + aligned_w;
+    const uint8_t *src1_u  = src1 + aligned_w;
+    const uint8_t *alpha_u = alpha + aligned_w;
+    uint8_t *dst_u = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n\t"
+            "vpsllw $0x8,%%ymm5,%%ymm5 \n\t"      // ymm5 = 0xff00 in every word
+            "mov $0x80808080,%%eax \n\t"
+            "vmovd %%eax,%%xmm6 \n\t"
+            "vbroadcastss %%xmm6,%%ymm6 \n\t"     // ymm6 = 0x80 in every byte
+            "mov $0x807f807f,%%eax \n\t"
+            "vmovd %%eax,%%xmm7 \n\t"
+            "vbroadcastss %%xmm7,%%ymm7 \n\t"     // ymm7 = 0x807f in every word
+            // Turn src0/src1/dst into offsets from alpha so that the single
+            // index register %2 walks all four rows.
+            "sub %2,%0 \n\t"
+            "sub %2,%1 \n\t"
+            "sub %2,%3 \n\t"
+
+            // 32 pixel per loop.
+            "1: \n\t"
+            "vmovdqu (%2),%%ymm0 \n\t"
+            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n\t"
+            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n\t"
+            "vpxor %%ymm5,%%ymm3,%%ymm3 \n\t"
+            "vpxor %%ymm5,%%ymm0,%%ymm0 \n\t"
+            "vmovdqu (%0,%2,1),%%ymm1 \n\t"
+            "vmovdqu (%1,%2,1),%%ymm2 \n\t"
+            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n\t"
+            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n\t"
+            "vpsubb %%ymm6,%%ymm4,%%ymm4 \n\t"
+            "vpsubb %%ymm6,%%ymm1,%%ymm1 \n\t"
+            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n\t"
+            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n\t"
+            "vpaddw %%ymm7,%%ymm3,%%ymm3 \n\t"
+            "vpaddw %%ymm7,%%ymm0,%%ymm0 \n\t"
+            "vpsrlw $0x8,%%ymm3,%%ymm3 \n\t"
+            "vpsrlw $0x8,%%ymm0,%%ymm0 \n\t"
+            "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n\t"
+            "vmovdqu %%ymm0,(%3,%2,1) \n\t"
+            "lea 0x20(%2),%2 \n\t"
+            "sub $0x20,%4 \n\t"
+            "jg 1b \n\t"
+            "vzeroupper \n\t"
+            : "+r"(src0), // %0
+              "+r"(src1), // %1
+              "+r"(alpha), // %2
+              "+r"(dst), // %3
+              "+rm"(aligned_w) // %4
+            ::"memory",
+              "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+              "xmm7");
+    }
+
+    // Scalar tail. Every pixel uses its own alpha (the old two-pixel unroll
+    // reused alpha_u[0] for the second pixel), and a non-positive width_u
+    // writes nothing.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha_u[i] + src1_u[i] * (255 - alpha_u[i]) + 255) >> 8;
+}
+
+// Global blend, 32 pixels per SIMD iteration; the remaining (width % 32)
+// pixels are handled by a scalar tail. Only alpha[0] is read.
+// dst[i] = ((src0[i]*alpha[0])+(src1[i]*(255-alpha[0]))+255)/256
+//
+// SIMD scheme: alpha[0] is broadcast to every byte and XORed with 0xff00
+// to form unsigned [a, 255-a] pairs; src0/src1 bytes are interleaved and
+// biased by -128 to become signed; vpmaddubsw then yields
+// a*src0 + (255-a)*src1 - 128*255 per pixel, and adding 0x807f
+// (= 128*255 + 255) restores the bias and applies the +255 rounding.
+// The unpack and pack steps both work per 128-bit lane, so pixel order is
+// preserved.
+//
+// The weight vector is loop-invariant, so it is built once in ymm5 with
+// VEX-encoded instructions. This keeps the kernel within ymm0-ymm7 (xmm8
+// and above do not exist in 32-bit mode) and avoids executing legacy-SSE
+// encodings while the upper ymm halves are in use.
+static void ff_global_blend_row_avx2(const uint8_t *src0,
+                                     const uint8_t *src1,
+                                     const uint8_t *alpha,
+                                     uint8_t *dst,
+                                     int width)
+{
+    int aligned_w = width/32 * 32;
+    int width_u = width - aligned_w;
+    // Tail pointers are captured now because the asm advances src0/src1/dst.
+    const uint8_t *src0_u = src0 + aligned_w;
+    const uint8_t *src1_u = src1 + aligned_w;
+    uint8_t *dst_u = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n\t"
+            "vpsllw $0x8,%%ymm5,%%ymm5 \n\t"      // ymm5 = 0xff00 in every word
+            "mov $0x80808080,%%eax \n\t"
+            "vmovd %%eax,%%xmm6 \n\t"
+            "vbroadcastss %%xmm6,%%ymm6 \n\t"     // ymm6 = 0x80 in every byte
+            "mov $0x807f807f,%%eax \n\t"
+            "vmovd %%eax,%%xmm7 \n\t"
+            "vbroadcastss %%xmm7,%%ymm7 \n\t"     // ymm7 = 0x807f in every word
+            "vpbroadcastb (%2),%%ymm3 \n\t"       // ymm3 = alpha[0] in every byte
+            "vpxor %%ymm5,%%ymm3,%%ymm5 \n\t"     // ymm5 = [a, 255-a] byte pairs
+
+            // 32 pixel per loop.
+            "1: \n\t"
+            "vmovdqu (%0),%%ymm1 \n\t"
+            "vmovdqu (%1),%%ymm2 \n\t"
+            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n\t"
+            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n\t"
+            "vpsubb %%ymm6,%%ymm4,%%ymm4 \n\t"
+            "vpsubb %%ymm6,%%ymm1,%%ymm1 \n\t"
+            "vpmaddubsw %%ymm4,%%ymm5,%%ymm3 \n\t" // weights unsigned, pixels signed
+            "vpmaddubsw %%ymm1,%%ymm5,%%ymm0 \n\t"
+            "vpaddw %%ymm7,%%ymm3,%%ymm3 \n\t"
+            "vpaddw %%ymm7,%%ymm0,%%ymm0 \n\t"
+            "vpsrlw $0x8,%%ymm3,%%ymm3 \n\t"
+            "vpsrlw $0x8,%%ymm0,%%ymm0 \n\t"
+            "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n\t"
+
+            "vmovdqu %%ymm0,(%3) \n\t"
+            "lea 0x20(%0),%0 \n\t"
+            "lea 0x20(%1),%1 \n\t"
+            "lea 0x20(%3),%3 \n\t"
+            "sub $0x20,%4 \n\t"
+            "jg 1b \n\t"
+            "vzeroupper \n\t"
+            : "+r"(src0), // %0
+              "+r"(src1), // %1
+              "+r"(alpha), // %2
+              "+r"(dst), // %3
+              "+rm"(aligned_w) // %4
+            ::"memory",
+              "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+              "xmm7");
+    }
+
+    // Scalar tail. A plain counted loop writes nothing when width_u is not
+    // positive; the old "width_u & 1" test wrote one byte for negative odd
+    // widths.
+    for (i = 0; i < width_u; i++)
+        dst_u[i] = (src0_u[i] * alpha[0] + src1_u[i] * (255 - alpha[0]) + 255) >> 8;
+}
+#endif
+
+/*
+ * Choose the row blending kernel for the running CPU.
+ *
+ * global != 0 picks from the global-alpha kernels, global == 0 from the
+ * per-pixel ones. The AVX2 check runs after the SSSE3 check, so AVX2 wins
+ * when both pass. Returns NULL when no kernel was compiled in or none
+ * passes its CPU flag check.
+ *
+ * NOTE(review): the kernels are inline asm but are gated with the
+ * EXTERNAL_* flag macros; presumably the INLINE_* variants are intended.
+ * Confirm against libavutil/x86/cpu.h.
+ */
+av_cold blend_row ff_blend_row_init_x86(int global)
+{
+    blend_row selected = NULL;
+    int flags = av_get_cpu_flags();
+
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+    if (EXTERNAL_SSSE3(flags))
+        selected = global ? ff_global_blend_row_ssse3
+                          : ff_per_pixel_blend_row_ssse3;
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+    if (EXTERNAL_AVX2_FAST(flags))
+        selected = global ? ff_global_blend_row_avx2
+                          : ff_per_pixel_blend_row_avx2;
+#endif
+
+    return selected;
+}
--
1.7.1
More information about the ffmpeg-devel
mailing list