[FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_convolution: Add X86 SIMD optimizations for filter_column()
xujunzz at sjtu.edu.cn
xujunzz at sjtu.edu.cn
Sun Dec 22 10:37:03 EET 2019
From: Xu Jun <xujunzz at sjtu.edu.cn>
Performance improves by about 10% compared to v1.
Tested using this command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark
after patch:
frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= 24x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=21.540s stime=2.091s rtime=7.197s
before patch:
frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=74.377s stime=1.880s rtime=16.420s
Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
---
libavfilter/x86/vf_convolution.asm | 202 ++++++++++++++++++++++++++
libavfilter/x86/vf_convolution_init.c | 9 ++
2 files changed, 211 insertions(+)
diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index 2a09374b00..4c700656d6 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -22,6 +22,8 @@
SECTION_RODATA
half: dd 0.5
+shuf_init: ddq 0x80808003808080028080800180808000 ; pshufb mask: byte 4*k selects source byte k (k = 0..3), the 0x80 bytes produce zero, so source bytes 0-3 become four zero-extended dwords
+shuf_step: ddq 0x00000004000000040000000400000004 ; added (paddd) to the mask in filter_column to move on to the next four source bytes
SECTION .text
@@ -285,3 +287,203 @@ sub widthq, rq
.end:
RET
%endif
+
+; void filter_column(uint8_t *dst, int height,
+; float rdiv, float bias, const int *const matrix,
+; const uint8_t *c[], int length, int radius,
+; int dstride, int stride);
+
+; COMPUTE_4COL n
+;   Accumulates one matrix tap into the four int32 column sums held in m1<n>
+;   (callers in this file pass n = 0..3, i.e. m10..m13).
+;   Inputs:   m4 = pshufb mask picking four source bytes, zero-extended to dwords
+;             m5 = matrix[i] replicated in all four dwords
+;             m6 = 16 source bytes (left untouched)
+;   Clobbers: m7
+%macro COMPUTE_4COL 1
+    mova            m7, m6                      ; work on a copy so m6 stays valid for the next group
+    pshufb          m7, m4                      ; four selected uint8s -> four dwords
+    pmulld          m7, m5                      ; pixel * matrix[i]
+    paddd           m1%1, m7                    ; sum<n> += products
+%endmacro
+
+%macro CVT_PACK_COL 1 ; turn the four int32 sums in m1<arg> into four uint8 results; expects m0 = rdiv, m1 = bias, m3 = 0.5 (all broadcast)
+ cvtdq2ps m1%1, m1%1 ; int32 sums -> float
+ mulps m1%1, m0 ; sum *= rdiv
+ addps m1%1, m1 ; sum += bias
+ addps m1%1, m3 ; sum += 0.5
+ cvttps2dq m1%1, m1%1 ; float -> int32 with truncation
+ packssdw m1%1, m1%1 ; int32 -> int16 with signed saturation
+ packuswb m1%1, m1%1 ; int16 -> uint8 with unsigned saturation; the low 32 bits now hold the 4 result bytes
+%endmacro
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; filter_column, SSE4, x86-64 only (UNIX64 and WIN64 argument layouts)
+;
+; For every output row y in [0, height) and every column x in [0, width):
+;     sum    = SUM over i in [0, 2*radius + 1) of c[i][y*stride + x] * matrix[i]
+;     dst[x] = (int)(sum * rdiv + bias + 0.5), saturated to 0..255
+; dst advances by dstride after each row. `width` is the C `length` argument.
+;
+; Register roles (after the WIN64 fix-up below):
+;     m0  = rdiv (broadcast)       m1  = bias (broadcast)    m3 = 0.5 (broadcast)
+;     m4  = current pshufb mask    m5  = matrix[i] (broadcast)
+;     m6  = 16 source bytes        m7  = scratch
+;     m8  = shuf_init              m9  = shuf_step
+;     m10-m13 = int32 sums for columns 0-3 / 4-7 / 8-11 / 12-15
+;     ystride = y*stride, i = tap index, ci = c[i], off16 = scalar column index,
+;     r = width % 4 in the vector part, reused as scratch in the scalar loop
+;
+; Path selection:
+;     width >= 16      .equal16: exactly 16 columns per row
+;     4 <= width < 16  .less16:  (width & ~3) columns vectorised, the rest scalar
+;     width < 4        .loopr:   scalar only
+;     height <= 0 or width <= 0: nothing is written
+;
+; NOTE(review): both vector paths load 16 bytes from c[i] + y*stride even when
+; width < 16; presumably the caller guarantees those bytes are readable --
+; confirm against the C caller.
+;------------------------------------------------------------------------------
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \
+                                  i, ci, ystride, sum, r, off16
+%else
+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \
+                                  i, ci, ystride, sum, r, off16
+%endif
+
+%if WIN64
+    ; WIN64 passes arguments by slot, so rdiv/bias (3rd/4th) arrive in xmm2/xmm3.
+    ; Rename them to m0/m1 and move the remaining arguments into r2-r7 so the
+    ; names can be redefined to the same layout as on UNIX64.
+    SWAP            m0, m2
+    SWAP            m1, m3
+    mov             r2q, matrixmp
+    mov             r3q, ptrmp
+    mov             r4q, widthmp
+    mov             r5q, radmp
+    mov             r6q, dstridemp
+    mov             r7q, stridemp
+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
+                i, ci, ystride, sum, r, off16
+%endif
+
+    ; int arguments: sign-extend to 64 bits before using them in addressing
+    movsxdifnidn    widthq, widthd
+    movsxdifnidn    radq, radd
+    lea             radq, [radq * 2 + 1]        ; rad = number of taps = 2*radius + 1
+    movsxdifnidn    dstrideq, dstrided
+    movsxdifnidn    strideq, strided
+    movsxdifnidn    heightq, heightd
+
+    ; Every loop below is bottom-tested: a non-positive height would wrap the
+    ; row counter and a non-positive width would still store one byte per row.
+    ; Bail out early so such calls write nothing.
+    test            heightq, heightq
+    jle             .end
+    test            widthq, widthq
+    jle             .end
+
+    VBROADCASTSS    m0, m0                      ; rdiv
+    VBROADCASTSS    m1, m1                      ; bias
+    movss           m3, [half]
+    VBROADCASTSS    m3, m3                      ; 0.5
+    movdqu          m8, [shuf_init]             ; shuffle initialization
+    movdqu          m9, [shuf_step]             ; shuffle step
+
+    xor             ystrideq, ystrideq          ; y*stride
+
+    cmp             widthq, mmsize              ; width < 16: .less16, otherwise 16 columns in parallel
+    jl              .less16
+
+; ---- one row, 16 columns -----------------------------------------------------
+.equal16:
+    pxor            m10, m10                    ; m10-m13 hold the sums, cleared per row
+    pxor            m11, m11
+    pxor            m12, m12
+    pxor            m13, m13
+
+    lea             iq, [radq - 1]              ; i runs from the last tap down to 0
+    .loopi:
+        movd            m5, [matrixq + 4*iq]        ; matrix[i]
+        VBROADCASTSS    m5, m5
+        mov             ciq, [ptrq + iq * gprsize]  ; c[i]
+        movdqu          m6, [ciq + ystrideq]        ; c[i][y*stride], 16 uint8s
+
+        movdqa          m4, m8                      ; m4 controls the shuffle
+        COMPUTE_4COL 0                              ; cols 0-3,   sum in m10
+        paddd           m4, m9
+        COMPUTE_4COL 1                              ; cols 4-7,   sum in m11
+        paddd           m4, m9
+        COMPUTE_4COL 2                              ; cols 8-11,  sum in m12
+        paddd           m4, m9
+        COMPUTE_4COL 3                              ; cols 12-15, sum in m13
+
+        sub             iq, 1
+        jns             .loopi
+
+    CVT_PACK_COL 0                              ; cols 0-3,   result in m10's low 32 bits
+    CVT_PACK_COL 1                              ; cols 4-7,   result in m11's low 32 bits
+    CVT_PACK_COL 2                              ; cols 8-11,  result in m12's low 32 bits
+    CVT_PACK_COL 3                              ; cols 12-15, result in m13's low 32 bits
+    punpckldq       m10, m11                    ; low qword = cols 0-7
+    punpckldq       m12, m13                    ; low qword = cols 8-15
+    punpcklqdq      m10, m12                    ; 16 results in m10
+    movdqu          [dstq], m10
+
+    add             dstq, dstrideq
+    add             ystrideq, strideq
+    sub             heightq, 1
+    jnz             .equal16
+    jmp             .end
+
+; ---- one row, fewer than 16 columns ------------------------------------------
+.less16:
+    xor             off16q, off16q
+    cmp             widthq, mmsize/4
+    jl              .loopr                      ; width < 4: scalar only
+
+    ; Split width into a multiple of 4 (kept in width for the vector part)
+    ; and the remainder r; width is restored before the scalar loop.
+    mov             rq, widthq
+    and             rq, mmsize/4-1
+    sub             widthq, rq
+
+    pxor            m10, m10
+    pxor            m11, m11
+    pxor            m12, m12
+
+    lea             iq, [radq - 1]
+    .loopi_4:
+        movd            m5, [matrixq + 4*iq]        ; matrix[i]
+        VBROADCASTSS    m5, m5
+        mov             ciq, [ptrq + iq * gprsize]  ; c[i]
+        movdqu          m6, [ciq + ystrideq]        ; c[i][y*stride], 16 uint8s
+
+        movdqa          m4, m8                      ; m4 controls the shuffle
+        COMPUTE_4COL 0                              ; cols 0-3, sum in m10
+        cmp             widthq, mmsize/4            ; width = 4
+        je              .i4_end
+
+        paddd           m4, m9
+        COMPUTE_4COL 1                              ; cols 4-7, sum in m11
+        cmp             widthq, mmsize/2            ; width = 8
+        je              .i4_end
+
+        paddd           m4, m9
+        COMPUTE_4COL 2                              ; cols 8-11, sum in m12
+
+    .i4_end:
+        sub             iq, 1
+        jns             .loopi_4
+
+    CVT_PACK_COL 0                              ; cols 0-3, result in m10's low 32 bits
+    movd            [dstq], m10
+    cmp             widthq, mmsize/4            ; width = 4
+    je              .cvt_end
+
+    CVT_PACK_COL 1                              ; cols 4-7, result in m11's low 32 bits
+    movd            [dstq + mmsize/4], m11
+    cmp             widthq, mmsize/2            ; width = 8
+    je              .cvt_end
+
+    CVT_PACK_COL 2                              ; cols 8-11, result in m12's low 32 bits
+    movd            [dstq + mmsize/2], m12
+
+    .cvt_end:
+    test            rq, rq                      ; no leftover columns: row is done
+    jz              .loopr_end
+    mov             off16q, widthq              ; scalar loop starts after the vectorised columns
+    add             widthq, rq                  ; restore the full width
+
+    ; scalar path: one column per iteration, columns off16 .. width-1
+    .loopr:
+    xor             sumq, sumq
+    lea             iq, [radq - 1]
+    .loopr_i:
+        mov             ciq, [ptrq + iq * gprsize]  ; c[i]
+        add             ciq, ystrideq
+        movzx           rd, byte [ciq + off16q]     ; r is free here, used as scratch for the pixel
+        imul            rd, [matrixq + 4*iq]
+        add             sumd, rd
+
+        sub             iq, 1
+        jns             .loopr_i
+
+    pxor            m7, m7                      ; clear the upper lanes before the scalar convert
+    cvtsi2ss        m7, sumd
+    mulss           m7, m0                      ; sum *= rdiv
+    addss           m7, m1                      ; sum += bias
+    addss           m7, m3                      ; sum += 0.5
+    cvttps2dq       m7, m7
+    packssdw        m7, m7
+    packuswb        m7, m7
+    movd            sumd, m7
+    mov             [dstq + off16q], sumb
+    add             off16q, 1
+    cmp             off16q, widthq
+    jl              .loopr
+
+    .loopr_end:
+    add             dstq, dstrideq
+    add             ystrideq, strideq
+    sub             heightq, 1
+    jnz             .less16
+
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 5eb3b3bee1..da39b8a400 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -34,6 +34,11 @@ void ff_filter_row_sse4(uint8_t *dst, int width,
const uint8_t *c[], int peak, int radius,
int dstride, int stride);
+void ff_filter_column_sse4(uint8_t *dst, int height,
+ float rdiv, float bias, const int *const matrix,
+ const uint8_t *c[], int length, int radius,
+ int dstride, int stride); /* SSE4 column filter; see the sse4 "cglobal filter_column" in vf_convolution.asm */
+
av_cold void ff_convolution_init_x86(ConvolutionContext *s)
{
#if ARCH_X86_64
@@ -50,6 +55,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
if (EXTERNAL_SSE4(cpu_flags))
s->filter[i] = ff_filter_row_sse4;
}
+ if (s->mode[i] == MATRIX_COLUMN) {
+ if (EXTERNAL_SSE4(cpu_flags))
+ s->filter[i] = ff_filter_column_sse4;
+ }
}
#endif
}
--
2.17.1
More information about the ffmpeg-devel
mailing list