[FFmpeg-devel] [PATCH V1 1/3] lavu: Add alpha blending API based on row.

Jun Zhao mypopydev at gmail.com
Tue Sep 25 18:27:13 EEST 2018


Add alpha blending API based on row, support global alpha blending/
per-pixel blending, and add SSSE3/AVX2 optimizations of the functions.

Signed-off-by: Jun Zhao <mypopydev at gmail.com>
---
 libavutil/Makefile         |    2 +
 libavutil/blend.c          |  101 ++++++++++++
 libavutil/blend.h          |   47 ++++++
 libavutil/x86/Makefile     |    3 +-
 libavutil/x86/blend.h      |   32 ++++
 libavutil/x86/blend_init.c |  369 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 553 insertions(+), 1 deletions(-)
 create mode 100644 libavutil/blend.c
 create mode 100644 libavutil/blend.h
 create mode 100644 libavutil/x86/blend.h
 create mode 100644 libavutil/x86/blend_init.c

diff --git a/libavutil/Makefile b/libavutil/Makefile
index 9ed24cf..f1c06e4 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -10,6 +10,7 @@ HEADERS = adler32.h                                                     \
           avstring.h                                                    \
           avutil.h                                                      \
           base64.h                                                      \
+          blend.h                                                       \
           blowfish.h                                                    \
           bprint.h                                                      \
           bswap.h                                                       \
@@ -95,6 +96,7 @@ OBJS = adler32.o                                                        \
        audio_fifo.o                                                     \
        avstring.o                                                       \
        base64.o                                                         \
+       blend.o                                                          \
        blowfish.o                                                       \
        bprint.o                                                         \
        buffer.o                                                         \
diff --git a/libavutil/blend.c b/libavutil/blend.c
new file mode 100644
index 0000000..e28efa0
--- /dev/null
+++ b/libavutil/blend.c
@@ -0,0 +1,101 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/blend.h"
+
+#include "libavutil/x86/blend.h"
+
+static void ff_global_blend_row_c(const uint8_t *src0,
+                                  const uint8_t *src1,
+                                  const uint8_t *alpha, /* XXX: only use alpha[0] */
+                                  uint8_t *dst,
+                                  int width)
+{
+    int x;
+    for (x = 0; x < width - 1; x += 2) {
+        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
+        dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255) >> 8;
+        src0 += 2;
+        src1 += 2;
+        dst  += 2;
+    }
+    if (width & 1) {
+        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
+    }
+}
+
+void av_global_blend_row(const uint8_t *src0,
+                         const uint8_t *src1,
+                         const uint8_t *alpha,
+                         uint8_t *dst,
+                         int width)
+{
+    blend_row blend_row_fn = NULL;
+
+#if ARCH_X86
+    blend_row_fn = ff_blend_row_init_x86(1);
+#endif
+
+    if (!blend_row_fn)
+        blend_row_fn = ff_global_blend_row_c;
+
+    blend_row_fn(src0, src1, alpha, dst, width);
+}
+
+static void ff_per_pixel_blend_row_c(const uint8_t *src0,
+                                     const uint8_t *src1,
+                                     const uint8_t *alpha,
+                                     uint8_t *dst,
+                                     int width)
+{
+    int x;
+    for (x = 0; x < width - 1; x += 2) {
+        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
+        dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255) >> 8;
+        src0 += 2;
+        src1 += 2;
+        dst  += 2;
+        alpha+= 2;
+    }
+    if (width & 1) {
+        dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) >> 8;
+    }
+}
+
+void av_per_pixel_blend_row(const uint8_t *src0,
+                            const uint8_t *src1,
+                            const uint8_t *alpha,
+                            uint8_t *dst,
+                            int width)
+{
+    blend_row blend_row_fn = NULL;
+
+#if ARCH_X86
+    blend_row_fn = ff_blend_row_init_x86(0);
+#endif
+
+    if (!blend_row_fn)
+        blend_row_fn = ff_per_pixel_blend_row_c;
+
+    blend_row_fn(src0, src1, alpha, dst, width);
+}
+
diff --git a/libavutil/blend.h b/libavutil/blend.h
new file mode 100644
index 0000000..8a42109
--- /dev/null
+++ b/libavutil/blend.h
@@ -0,0 +1,47 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef AVUTIL_BLEND_H
+#define AVUTIL_BLEND_H
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+
+/**
+ * Global alpha blending by row
+ *
+ * dst[i] = (src[i]*alpha[0]+(255-alpha[0])*src1[i]+255)>>8
+ */
+void av_global_blend_row(const uint8_t *src0,
+                         const uint8_t *src1,
+                         const uint8_t *alpha, /* XXX: only use alpha[0] */
+                         uint8_t *dst,
+                         int width);
+
+/**
+ * Per-pixel alpha blending by row
+ *
+ * dst[i] = (src[i]*alpha[i]+(255-alpha[i])*src1[i]+255)>>8
+ */
+void av_per_pixel_blend_row(const uint8_t *src0,
+                            const uint8_t *src1,
+                            const uint8_t *alpha,
+                            uint8_t *dst,
+                            int width);
+#endif
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 5f5242b..1e5e3e4 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,4 +1,5 @@
-OBJS += x86/cpu.o                                                       \
+OBJS += x86/blend_init.o                                                \
+        x86/cpu.o                                                       \
         x86/fixed_dsp_init.o                                            \
         x86/float_dsp_init.o                                            \
         x86/imgutils_init.o                                             \
diff --git a/libavutil/x86/blend.h b/libavutil/x86/blend.h
new file mode 100644
index 0000000..9fa0f36
--- /dev/null
+++ b/libavutil/x86/blend.h
@@ -0,0 +1,32 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_BLEND_H
+#define AVUTIL_X86_BLEND_H
+
+#include "libavutil/blend.h"
+
+typedef void (*blend_row)(const uint8_t *src0,
+                          const uint8_t *src1,
+                          const uint8_t *alpha,
+                          uint8_t *dst,
+                          int width);
+
+blend_row ff_blend_row_init_x86(int global);
+
+#endif /* AVUTIL_X86_BLEND_H */
diff --git a/libavutil/x86/blend_init.c b/libavutil/x86/blend_init.c
new file mode 100644
index 0000000..f555dfa
--- /dev/null
+++ b/libavutil/x86/blend_init.c
@@ -0,0 +1,369 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/blend.h"
+
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+// per-pixel blend (8 pixels at a time.)
+// dst[i] = ((src0[i]*alpah[i])+(src1[i]*(255-alpha[i]))+255)/256
+static void ff_per_pixel_blend_row_ssse3(const uint8_t *src0,
+                                         const uint8_t *src1,
+                                         const uint8_t *alpha,
+                                         uint8_t *dst,
+                                         int width)
+{
+    int aligned_w = width/8 * 8;
+    int width_u = width - aligned_w;
+    uint8_t *src0_u  = (uint8_t *)src0 + aligned_w;
+    uint8_t *src1_u  = (uint8_t *)src1 + aligned_w;
+    uint8_t *alpha_u = (uint8_t *)alpha + aligned_w;
+    uint8_t *dst_u  = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "pcmpeqb    %%xmm3,%%xmm3                  \n\t"
+            "psllw      $0x8,%%xmm3                    \n\t"
+            "mov        $0x80808080,%%eax              \n\t"
+            "movd       %%eax,%%xmm3                   \n\t"
+            "pshufd     $0x0,%%xmm4,%%xmm4             \n\t"
+            "mov        $0x807f807f,%%eax              \n\t"
+            "movd       %%eax,%%xmm5                   \n\t"
+            "pshufd     $0x0,%%xmm5,%%xmm5             \n\t"
+            "sub        %2,%0                          \n\t"
+            "sub        %2,%1                          \n\t"
+            "sub        %2,%3                          \n\t"
+
+            // 8 pixel per loop.
+            "1:                                        \n\t"
+            "movq       (%2),%%xmm0                    \n\t"
+            "punpcklbw  %%xmm0,%%xmm0                  \n\t"
+            "pxor       %%xmm3,%%xmm0                  \n\t"
+            "movq       (%0,%2,1),%%xmm1               \n\t"
+            "movq       (%1,%2,1),%%xmm2               \n\t"
+            "punpcklbw  %%xmm2,%%xmm1                  \n\t"
+            "psubb      %%xmm4,%%xmm1                  \n\t"
+            "pmaddubsw  %%xmm1,%%xmm0                  \n\t"
+            "paddw      %%xmm5,%%xmm0                  \n\t"
+            "psrlw      $0x8,%%xmm0                    \n\t"
+            "packuswb   %%xmm0,%%xmm0                  \n\t"
+            "movq       %%xmm0,(%3,%2,1)               \n\t"
+            "lea        0x8(%2),%2                     \n\t"
+            "sub        $0x8,%4                        \n\t"
+            "jg        1b                              \n\t"
+            : "+r"(src0),       // %0
+              "+r"(src1),       // %1
+              "+r"(alpha),      // %2
+              "+r"(dst),        // %3
+              "+rm"(aligned_w)  // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+    }
+
+    for (i = 0; i < width_u - 1; i += 2) {
+        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
+        dst_u[1] = (src0_u[1] * alpha_u[0] + src1_u[1] * (255 - alpha_u[0]) + 255) >> 8;
+        src0_u += 2;
+        src1_u += 2;
+        dst_u  += 2;
+        alpha_u+= 2;
+    }
+    if (width_u & 1) {
+        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
+    }
+}
+
+// global blend (8 pixels at a time).
+// dst[i] = ((src0[i]*alpah[0])+(src1[i]*(255-alpha[0]))+255)/256
+static void ff_global_blend_row_ssse3(const uint8_t *src0,
+                                      const uint8_t *src1,
+                                      const uint8_t *alpha,
+                                      uint8_t *dst,
+                                      int width)
+{
+    int aligned_w = width/8 * 8;
+    int width_u = width - aligned_w;
+    uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
+    uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
+    uint8_t *dst_u  = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "pcmpeqb    %%xmm3,%%xmm3                  \n\t"
+            "psllw      $0x8,%%xmm3                    \n\t"
+            "mov        $0x80808080,%%eax              \n\t"
+            "movd       %%eax,%%xmm4                   \n\t"
+            "pshufd     $0x0,%%xmm4,%%xmm4             \n\t"
+            "mov        $0x807f807f,%%eax              \n\t"
+            "movd       %%eax,%%xmm5                   \n\t"
+            "pshufd     $0x0,%%xmm5,%%xmm5             \n\t"
+            // a => xmm6 [a a a a a a a a a a a a a a a a ]
+            "movb       (%2),%%al                      \n\t"
+            "movd       %%eax,%%xmm6                   \n\t" // xmm6 = x x x x x x x x x x x x x x x a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x x x x x x x x x x x x x x a a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x x x x x x x x x x x x a a a a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = x x x x x x x x a a a a a a a a
+            "punpcklbw  %%xmm6,%%xmm6                  \n\t" // xmm6 = a a a a a a a a a a a a a a a a
+
+            // 8 pixel per loop.
+            "1:                                        \n\t"
+            "movdqu     %%xmm6,%%xmm0                  \n\t" // xmm0 = xmm6
+            "pxor       %%xmm3,%%xmm0                  \n\t"
+
+            "movq       (%0),%%xmm1                    \n\t"
+            "movq       (%1),%%xmm2                    \n\t"
+            "punpcklbw  %%xmm2,%%xmm1                  \n\t"
+            "psubb      %%xmm4,%%xmm1                  \n\t"
+
+            "pmaddubsw  %%xmm1,%%xmm0                  \n\t"
+            "paddw      %%xmm5,%%xmm0                  \n\t"
+            "psrlw      $0x8,%%xmm0                    \n\t"
+            "packuswb   %%xmm0,%%xmm0                  \n\t"
+            "movq       %%xmm0,(%3)                    \n\t"
+
+            "lea        0x8(%0),%0                     \n\t" // src0+8
+            "lea        0x8(%1),%1                     \n\t" // src1+8
+            "lea        0x8(%3),%3                     \n\t" // dst+8
+            "sub        $0x8,%4                        \n\t"
+            "jg        1b                              \n\t"
+            : "+r"(src0),       // %0
+              "+r"(src1),       // %1
+              "+r"(alpha),      // %2
+              "+r"(dst),        // %3
+              "+rm"(aligned_w)  // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+    }
+
+    for (i = 0; i < width_u - 1; i += 2) {
+        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
+        dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) + 255) >> 8;
+        src0_u += 2;
+        src1_u += 2;
+        dst_u  += 2;
+    }
+    if (width_u & 1) {
+        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
+    }
+}
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+// per-pixe blend (32 pixels at a time).
+// dst[i] = ((src0[i]*alpah[i])+(src1[i]*(255-alpha[i]))+255)/256
+static void ff_per_pixel_blend_row_avx2(const uint8_t *src0,
+                                        const uint8_t *src1,
+                                        const uint8_t *alpha,
+                                        uint8_t *dst,
+                                        int width)
+{
+    int aligned_w = width/32 * 32;
+    int width_u = width - aligned_w;
+    uint8_t *src0_u  = (uint8_t *)src0 + aligned_w;
+    uint8_t *src1_u  = (uint8_t *)src1 + aligned_w;
+    uint8_t *alpha_u = (uint8_t *)alpha + aligned_w;
+    uint8_t *dst_u  = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n\t"
+            "vpsllw     $0x8,%%ymm5,%%ymm5             \n\t"
+            "mov        $0x80808080,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm6                   \n\t"
+            "vbroadcastss %%xmm6,%%ymm6                \n\t"
+            "mov        $0x807f807f,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm7                   \n\t"
+            "vbroadcastss %%xmm7,%%ymm7                \n\t"
+            "sub        %2,%0                          \n\t"
+            "sub        %2,%1                          \n\t"
+            "sub        %2,%3                          \n\t"
+
+            // 32 pixel per loop.
+            "1:                                        \n\t"
+            "vmovdqu    (%2),%%ymm0                    \n\t"
+            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n\t"
+            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n\t"
+            "vpxor      %%ymm5,%%ymm3,%%ymm3           \n\t"
+            "vpxor      %%ymm5,%%ymm0,%%ymm0           \n\t"
+            "vmovdqu    (%0,%2,1),%%ymm1               \n\t"
+            "vmovdqu    (%1,%2,1),%%ymm2               \n\t"
+            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n\t"
+            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n\t"
+            "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n\t"
+            "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n\t"
+            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n\t"
+            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n\t"
+            "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n\t"
+            "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n\t"
+            "vpsrlw     $0x8,%%ymm3,%%ymm3             \n\t"
+            "vpsrlw     $0x8,%%ymm0,%%ymm0             \n\t"
+            "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n\t"
+            "vmovdqu    %%ymm0,(%3,%2,1)               \n\t"
+            "lea        0x20(%2),%2                    \n\t"
+            "sub        $0x20,%4                       \n\t"
+            "jg        1b                              \n\t"
+            "vzeroupper                                \n\t"
+            : "+r"(src0),      // %0
+              "+r"(src1),      // %1
+              "+r"(alpha),     // %2
+              "+r"(dst),       // %3
+              "+rm"(aligned_w) // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+             "xmm7");
+    }
+
+    for (i = 0; i < width_u - 1; i += 2) {
+        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
+        dst_u[1] = (src0_u[1] * alpha_u[0] + src1_u[1] * (255 - alpha_u[0]) + 255) >> 8;
+        src0_u += 2;
+        src1_u += 2;
+        dst_u  += 2;
+        alpha_u+= 2;
+    }
+    if (width_u & 1) {
+        dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - alpha_u[0]) + 255) >> 8;
+    }
+}
+
+// global blend (32 pixels at a time)
+// dst[i] = ((src0[i]*alpah[0])+(src1[i]*(255-alpha[0]))+255)/256
+static void ff_global_blend_row_avx2(const uint8_t *src0,
+                                     const uint8_t *src1,
+                                     const uint8_t *alpha,
+                                     uint8_t *dst,
+                                     int width)
+{
+    int aligned_w = width/32 * 32;
+    int width_u = width - aligned_w;
+    uint8_t *src0_u = (uint8_t *)src0 + aligned_w;
+    uint8_t *src1_u = (uint8_t *)src1 + aligned_w;
+    uint8_t *dst_u  = dst + aligned_w;
+    int i;
+
+    if (aligned_w > 0) {
+        __asm__ volatile(
+            "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n\t"
+            "vpsllw     $0x8,%%ymm5,%%ymm5             \n\t"
+            "mov        $0x80808080,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm6                   \n\t"
+            "vbroadcastss %%xmm6,%%ymm6                \n\t"
+            "mov        $0x807f807f,%%eax              \n\t"
+            "vmovd      %%eax,%%xmm7                   \n\t"
+            "vbroadcastss %%xmm7,%%ymm7                \n\t"
+            // a => ymm8 [a a a a a a a a a a a a a a a a
+            //            a a a a a a a a a a a a a a a a
+            //            a a a a a a a a a a a a a a a a
+            //            a a a a a a a a a a a a a a a a]
+            "movb       (%2),%%al                      \n\t"
+            "movd       %%eax,%%xmm8                   \n\t" // xmm8 = x x x x x x x x x x x x x x x a
+            "punpcklbw  %%xmm8,%%xmm8                  \n\t" // xmm8 = x x x x x x x x x x x x x x a a
+            "punpcklbw  %%xmm8,%%xmm8                  \n\t" // xmm8 = x x x x x x x x x x x x a a a a
+            "vbroadcastss %%xmm8,%%ymm8                \n\t"
+
+            // 32 pixel per loop.
+            "1:                                        \n\t"
+            "vmovdqu    %%ymm8,%%ymm0                  \n\t"
+            "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n\t"
+            "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n\t"
+            "vpxor      %%ymm5,%%ymm3,%%ymm3           \n\t"
+            "vpxor      %%ymm5,%%ymm0,%%ymm0           \n\t"
+
+            "vmovdqu    (%0),%%ymm1                    \n\t"
+            "vmovdqu    (%1),%%ymm2                    \n\t"
+            "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n\t"
+            "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n\t"
+            "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n\t"
+            "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n\t"
+            "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n\t"
+            "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n\t"
+            "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n\t"
+            "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n\t"
+            "vpsrlw     $0x8,%%ymm3,%%ymm3             \n\t"
+            "vpsrlw     $0x8,%%ymm0,%%ymm0             \n\t"
+            "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n\t"
+
+            "vmovdqu    %%ymm0,(%3)                    \n\t"
+            "lea        0x20(%0),%0                    \n\t"
+            "lea        0x20(%1),%1                    \n\t"
+            "lea        0x20(%3),%3                    \n\t"
+            "sub        $0x20,%4                       \n\t"
+            "jg        1b                              \n\t"
+            "vzeroupper                                \n\t"
+            : "+r"(src0),       // %0
+              "+r"(src1),       // %1
+              "+r"(alpha),      // %2
+              "+r"(dst),        // %3
+              "+rm"(aligned_w)  // %4
+            ::"memory",
+             "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+             "xmm7", "xmm8");
+    }
+
+    for (i = 0; i < width_u - 1; i += 2) {
+        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
+        dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) + 255) >> 8;
+        src0_u += 2;
+        src1_u += 2;
+        dst_u  += 2;
+    }
+    if (width_u & 1) {
+        dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + 255) >> 8;
+    }
+}
+#endif
+
+av_cold blend_row ff_blend_row_init_x86(int global)
+{
+    blend_row blend_row_fn = NULL;
+    int cpu_flags = av_get_cpu_flags();
+
+    if (global) {
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            blend_row_fn = ff_global_blend_row_ssse3;
+        }
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            blend_row_fn = ff_global_blend_row_avx2;
+        }
+#endif
+    } else {
+#if HAVE_SSSE3_INLINE && HAVE_6REGS
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            blend_row_fn = ff_per_pixel_blend_row_ssse3;
+        }
+#endif
+
+#if HAVE_AVX2_INLINE && HAVE_6REGS
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            blend_row_fn = ff_per_pixel_blend_row_avx2;
+        }
+#endif
+    }
+
+    return blend_row_fn;
+}
-- 
1.7.1



More information about the ffmpeg-devel mailing list