[FFmpeg-cvslog] avfilter/x86/vf_blend : add 16 bit version for BLEND_SIMPLE, phoenix, difference for SSE and AVX2 (x86_64)

Martin Vignali git at videolan.org
Sat Feb 24 22:46:10 EET 2018


ffmpeg | branch: master | Martin Vignali <martin.vignali at gmail.com> | Sat Feb 17 21:01:34 2018 +0100| [53a03b5c8c7d355bd353727115efc9977aa76f28] | committer: Martin Vignali

avfilter/x86/vf_blend : add 16 bit version for BLEND_SIMPLE, phoenix, difference for SSE and AVX2 (x86_64)

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=53a03b5c8c7d355bd353727115efc9977aa76f28
---

 libavfilter/x86/vf_blend.asm    | 75 ++++++++++++++++++++++++++++++++++-------
 libavfilter/x86/vf_blend_init.c | 54 +++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 680e266348..5d9a909192 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -36,10 +36,13 @@ pb_255: times 16 db 255
 
 SECTION .text
 
-%macro BLEND_INIT 2
+%macro BLEND_INIT 2-3
 %if ARCH_X86_64
 cglobal blend_%1, 6, 9, %2, top, top_linesize, bottom, bottom_linesize, dst, dst_linesize, width, end, x
     mov    widthd, dword widthm
+    %if %0 == 3; is 16 bit
+        add    widthq, widthq ; double the width: 16-bit samples take 2 bytes each (this form doesn't compile on x86_32)
+    %endif
 %else
 cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end, x
 %define dst_linesizeq r5mp
@@ -61,8 +64,8 @@ cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end
 REP_RET
 %endmacro
 
-%macro BLEND_SIMPLE 2
-BLEND_INIT %1, 2
+%macro BLEND_SIMPLE 2-3
+BLEND_INIT %1, 2, %3
 .nextrow:
     mov        xq, widthq
 
@@ -270,8 +273,9 @@ BLEND_INIT divide, 4
 BLEND_END
 %endmacro
 
-%macro PHOENIX 0
-BLEND_INIT phoenix, 4
+%macro PHOENIX 2-3
+; %1 = function name, %2 = element-size suffix for the min/max/saturating ops (b or w), %3 = (optional) 1 for the 16-bit version
+BLEND_INIT %1, 4, %3
     VBROADCASTI128       m3, [pb_255]
 .nextrow:
     mov        xq, widthq
@@ -280,19 +284,19 @@ BLEND_INIT phoenix, 4
         movu            m0, [topq + xq]
         movu            m1, [bottomq + xq]
         mova            m2, m0
-        pminub          m0, m1
-        pmaxub          m1, m2
+        pminu%2         m0, m1
+        pmaxu%2         m1, m2
         mova            m2, m3
-        psubusb         m2, m1
-        paddusb         m2, m0
+        psubus%2        m2, m1
+        paddus%2        m2, m0
         mova   [dstq + xq], m2
         add             xq, mmsize
     jl .loop
 BLEND_END
 %endmacro
 
-%macro BLEND_ABS 0
-BLEND_INIT difference, 5
+%macro DIFFERENCE 1-2
+BLEND_INIT %1, 5, %2
     pxor       m2, m2
 .nextrow:
     mov        xq, widthq
@@ -300,6 +304,17 @@ BLEND_INIT difference, 5
     .loop:
         movu            m0, [topq + xq]
         movu            m1, [bottomq + xq]
+%if %0 == 2 ; 16 bit
+        punpckhwd       m3, m0, m2
+        punpcklwd       m0, m2
+        punpckhwd       m4, m1, m2
+        punpcklwd       m1, m2
+        psubd           m0, m1
+        psubd           m3, m4
+        pabsd           m0, m0
+        pabsd           m3, m3
+        packusdw        m0, m3
+%else
         punpckhbw       m3, m0, m2
         punpcklbw       m0, m2
         punpckhbw       m4, m1, m2
@@ -308,11 +323,14 @@ BLEND_INIT difference, 5
         psubw           m3, m4
         ABS2            m0, m3, m1, m4
         packuswb        m0, m3
+%endif
         mova   [dstq + xq], m0
         add             xq, mmsize
     jl .loop
 BLEND_END
+%endmacro
 
+%macro BLEND_ABS 0
 BLEND_INIT extremity, 8
     pxor       m2, m2
     VBROADCASTI128       m4, [pw_255]
@@ -378,14 +396,32 @@ BLEND_SCREEN
 AVERAGE
 GRAINMERGE
 HARDMIX
-PHOENIX
+PHOENIX phoenix, b
+DIFFERENCE difference
 DIVIDE
 
 BLEND_ABS
 
+%if ARCH_X86_64
+BLEND_SIMPLE addition_16, addusw, 1
+BLEND_SIMPLE and_16,      and,    1
+BLEND_SIMPLE or_16,       or,     1
+BLEND_SIMPLE subtract_16, subusw, 1
+BLEND_SIMPLE xor_16,      xor,    1
+%endif
+
 INIT_XMM ssse3
+DIFFERENCE difference
 BLEND_ABS
 
+INIT_XMM sse4
+%if ARCH_X86_64
+BLEND_SIMPLE darken_16,   minuw, 1
+BLEND_SIMPLE lighten_16,  maxuw, 1
+PHOENIX      phoenix_16,      w, 1
+DIFFERENCE   difference_16,      1
+%endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 BLEND_SIMPLE xor,      xor
@@ -401,7 +437,20 @@ BLEND_SCREEN
 AVERAGE
 GRAINMERGE
 HARDMIX
-PHOENIX
+PHOENIX phoenix, b
 
+DIFFERENCE difference
 BLEND_ABS
+
+%if ARCH_X86_64
+BLEND_SIMPLE addition_16, addusw, 1
+BLEND_SIMPLE and_16,      and,    1
+BLEND_SIMPLE darken_16,   minuw,  1
+BLEND_SIMPLE lighten_16,  maxuw,  1
+BLEND_SIMPLE or_16,       or,     1
+BLEND_SIMPLE subtract_16, subusw, 1
+BLEND_SIMPLE xor_16,      xor,    1
+PHOENIX      phoenix_16,       w, 1
+DIFFERENCE   difference_16,       1
+%endif
 %endif
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index c9c7a52ef9..0962f6d7fd 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -69,6 +69,27 @@ BLEND_FUNC(negation, sse2)
 BLEND_FUNC(negation, ssse3)
 BLEND_FUNC(negation, avx2)
 
+#if ARCH_X86_64
+BLEND_FUNC(addition_16, sse2)
+BLEND_FUNC(addition_16, avx2)
+BLEND_FUNC(and_16, sse2)
+BLEND_FUNC(and_16, avx2)
+BLEND_FUNC(darken_16, sse4)
+BLEND_FUNC(darken_16, avx2)
+BLEND_FUNC(difference_16, sse4)
+BLEND_FUNC(difference_16, avx2)
+BLEND_FUNC(lighten_16, sse4)
+BLEND_FUNC(lighten_16, avx2)
+BLEND_FUNC(or_16, sse2)
+BLEND_FUNC(or_16, avx2)
+BLEND_FUNC(phoenix_16, sse4)
+BLEND_FUNC(phoenix_16, avx2)
+BLEND_FUNC(subtract_16, sse2)
+BLEND_FUNC(subtract_16, avx2)
+BLEND_FUNC(xor_16, sse2)
+BLEND_FUNC(xor_16, avx2)
+#endif /* ARCH_X86_64 */
+
 av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -125,5 +146,38 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
             case BLEND_NEGATION:     param->blend = ff_blend_negation_avx2;     break;
             }
         }
+    } else { /* is_16_bit */
+#if ARCH_X86_64
+        if (EXTERNAL_SSE2(cpu_flags) && param->opacity == 1) {
+            switch (param->mode) {
+            case BLEND_ADDITION: param->blend = ff_blend_addition_16_sse2; break;
+            case BLEND_AND:      param->blend = ff_blend_and_16_sse2;      break;
+            case BLEND_OR:       param->blend = ff_blend_or_16_sse2;       break;
+            case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_sse2; break;
+            case BLEND_XOR:      param->blend = ff_blend_xor_16_sse2;      break;
+            }
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1) {
+            switch (param->mode) {
+            case BLEND_DARKEN:   param->blend = ff_blend_darken_16_sse4;     break;
+            case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_sse4; break;
+            case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_16_sse4;    break;
+            case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_16_sse4;    break;
+            }
+        }
+        if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1) {
+            switch (param->mode) {
+            case BLEND_ADDITION: param->blend = ff_blend_addition_16_avx2; break;
+            case BLEND_AND:      param->blend = ff_blend_and_16_avx2;      break;
+            case BLEND_DARKEN:   param->blend = ff_blend_darken_16_avx2;   break;
+            case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_avx2; break;
+            case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_16_avx2;  break;
+            case BLEND_OR:       param->blend = ff_blend_or_16_avx2;       break;
+            case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_16_avx2;  break;
+            case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_avx2; break;
+            case BLEND_XOR:      param->blend = ff_blend_xor_16_avx2;      break;
+            }
+        }
+#endif /* ARCH_X86_64 */
     }
 }



More information about the ffmpeg-cvslog mailing list