[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide
Timothy Gu
timothygu99 at gmail.com
Sun Feb 14 01:27:17 CET 2016
---
The reason this function requires SSE4.1 is the roundps instruction. I would
love to find a way to truncate a float to an integer using only SSE2.
---
libavfilter/x86/vf_blend.asm | 32 ++++++++++++++++++++++++++++++++
libavfilter/x86/vf_blend_init.c | 6 ++++++
2 files changed, 38 insertions(+)
diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index a5ea74c..dac04d7 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
SECTION_RODATA
+ps_255: times 4 dd 255.0 ; four packed single-precision 255.0 values, loaded by blend_divide
pw_1: times 8 dw 1
pw_128: times 8 dw 128
pw_255: times 8 dw 255
@@ -285,3 +286,34 @@ INIT_XMM sse2
BLEND_ABS
INIT_XMM ssse3
BLEND_ABS
+
+; blend_divide: dst = min(trunc(255 * top / bottom), 255) on 8-bit samples, 4 per iteration.
+; Body needs only SSE2 now; the sse4 suffix is kept so the symbol name used by the C init is unchanged.
+INIT_XMM sse4
+BLEND_INIT divide, 4
+    pxor       m2, m2                        ; all-zero register used for zero-extension
+    mova       m3, [ps_255]                  ; 255.0 in every lane: scale factor and upper clamp
+.nextrow:
+    mov        xq, widthq                    ; x runs up towards 0 (presumably width was negated by BLEND_INIT - confirm)
+
+    .loop:
+        movd            m0, [topq + xq]      ; 000000xx
+        movd            m1, [bottomq + xq]
+        punpcklbw       m0, m2               ; 00000x0x
+        punpcklbw       m1, m2
+        punpcklwd       m0, m2               ; 000x000x
+        punpcklwd       m1, m2
+
+        cvtdq2ps        m0, m0
+        cvtdq2ps        m1, m1
+        divps           m0, m1               ; a / b (b == 0 yields +inf, or NaN for 0 / 0)
+        mulps           m0, m3               ; a / b * 255
+        minps           m0, m3               ; clamp to 255; inf and NaN also become 255
+        cvttps2dq       m0, m0               ; truncating convert, no separate roundps needed
+
+        packssdw        m0, m0               ; 00000x0x (values are 0..255, signed pack is exact)
+        packuswb        m0, m0               ; 000000xx
+        movd   [dstq + xq], m0
+        add             xq, mmsize / 4       ; 4 pixels done; flags feed the jl below
+    jl .loop
+BLEND_END
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index a6baf94..f542870 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
BLEND_FUNC(difference, ssse3)
BLEND_FUNC(negation, sse2)
BLEND_FUNC(negation, ssse3)
+BLEND_FUNC(divide, sse4)
av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
{
@@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
}
}
+ if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) { /* asm path: full opacity, non-16-bit input only */
+ switch (param->mode) {
+ case BLEND_DIVIDE: param->blend = ff_blend_divide_sse4; break; /* blend_divide from vf_blend.asm */
+ }
+ }
}
--
2.1.4
More information about the ffmpeg-devel
mailing list