[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide

Timothy Gu timothygu99 at gmail.com
Sun Feb 14 01:27:17 CET 2016


---

This function requires SSE4.1 because of the roundps instruction. I would
love to find a way to truncate a float to an integer using only SSE2.

---
 libavfilter/x86/vf_blend.asm    | 32 ++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_blend_init.c |  6 ++++++
 2 files changed, 38 insertions(+)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index a5ea74c..dac04d7 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+ps_255: times 4 dd 255.0
 pw_1:   times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
@@ -285,3 +286,34 @@ INIT_XMM sse2
 BLEND_ABS
 INIT_XMM ssse3
 BLEND_ABS
+
+INIT_XMM sse4                               ; suffix kept: C init uses ff_blend_divide_sse4
+BLEND_INIT divide, 4
+    pxor       m2, m2                        ; m2 = 0, zero source for the unpacks
+    mova       m3, [ps_255]                  ; m3 = 4 x 255.0 (scale factor and clip limit)
+.nextrow:
+    mov        xq, widthq
+    ; dst = min(top / bottom * 255, 255) truncated to a byte; movd moves 4 bytes per pass.
+    .loop:
+        movd            m0, [topq + xq]      ; 000000xx
+        movd            m1, [bottomq + xq]
+        punpcklbw       m0, m2               ; 00000x0x
+        punpcklbw       m1, m2
+        punpcklwd       m0, m2               ; 000x000x
+        punpcklwd       m1, m2
+
+        cvtdq2ps        m0, m0
+        cvtdq2ps        m1, m1
+        divps           m0, m1               ; a / b
+        mulps           m0, m3               ; a / b * 255
+        ; Clip first: minps returns m3 (255.0) for NaN (0 / 0); +inf (x / 0) clamps too.
+        minps           m0, m3
+        cvttps2dq       m0, m0               ; truncating convert (SSE2); replaces roundps
+        ; Lanes are now 0..255, so signed saturation (SSE2 packssdw) is lossless.
+        packssdw        m0, m0               ; 00000x0x
+        packuswb        m0, m0               ; 000000xx
+        movd   [dstq + xq], m0
+        add             xq, mmsize / 4       ; sets the flags tested by jl below
+
+    jl .loop
+BLEND_END
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index a6baf94..f542870 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
 BLEND_FUNC(difference, ssse3)
 BLEND_FUNC(negation, sse2)
 BLEND_FUNC(negation, ssse3)
+BLEND_FUNC(divide, sse4)
 
 av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 {
@@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         case BLEND_NEGATION:   param->blend = ff_blend_negation_ssse3;   break;
         }
     }
+    if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) {
+        switch (param->mode) {
+        case BLEND_DIVIDE:   param->blend = ff_blend_divide_sse4;   break;
+        }
+    }
 }
-- 
2.1.4



More information about the ffmpeg-devel mailing list