[FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86 SIMD for filter_column()

Tue Dec 3 10:59:06 EET 2019

comments inline in code


At 2019-12-03 15:52:07, xujunzz at sjtu.edu.cn wrote:
>From: Xu Jun <xujunzz at sjtu.edu.cn>
>
>+; void filter_column(uint8_t *dst, int height,
>+;                         float rdiv, float bias, const int *const matrix,
>+;                         const uint8_t *c[], int length, int radius,
>+;                         int dstride, int stride);
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
>+%else
>+cglobal filter_column, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r

>+%endif
no idea, these are difficult to read and understand


>+
>+%if WIN64
>+    SWAP m0, m2
>+    SWAP m1, m3
>+    mov r2q, matrixmp
>+    mov r3q, ptrmp
>+    mov r4q, widthmp
>+    mov r5q, radmp
>+    mov r6q, dstridemp
>+    mov r7q, stridemp
>+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+sal radq, 1

>+add radq, 1     ;2*radius+1
I don't know how about compare to "LEA x,[y*2+1]"
And....I want not discuss in between SAL and SHL


>+movsxdifnidn heightq, heightd
>+VBROADCASTSS m0, m0
>+VBROADCASTSS m1, m1
>+pxor m6, m6
>+movss m5, [half]
>+VBROADCASTSS m5, m5
>+
>+xor dst_offq, dst_offq
>+xor c_offq, c_offq
>+
>+.loopy:
>+    xor off16q, off16q
>+    cmp widthq, mmsize/4
>+    jl .loopr
>+
>+    mov rq, widthq
>+    and rq, mmsize/4-1
>+    sub widthq, rq
>+

>+    .loop16: ;parallel process 16 elements in a row
Processing 4 column per loop, are you means, we want to save lots of unused register?
We claim X64, so we have 16 of XMMs


>+        pxor m4, m4
>+        xor iq, iq
>+        .loopi:

>+            movss m2, [matrixq + 4*iq]
no idea that you working on Float data path, we are lucky, Intel CPU sounds not penalty in here.


>+            VBROADCASTSS m2, m2
>+            mov ciq, [ptrq + iq * gprsize]
>+            movss m3, [ciq + c_offq] ;c[i][y*stride + off16]
>+            punpcklbw m3, m6

>+            punpcklwd m3, m6
Since you claim SSE4, the instruction PMOVZXBD available, moreover, SSE4 register can be full fill 16 of uint8, but load 4 of them only.


>+            pmulld m2, m3
>+            paddd m4, m2
>+
>+            add iq, 1

>+            cmp iq, radq
When you initial iq to radq and decrement per loop, you can reduce one instruction
I know iq is work as index in the loop, but we can found some trick over there.
>+            jl .loopi
>+
>+        cvtdq2ps m4, m4
>+        mulps m4, m0     ; sum *= rdiv
>+        addps m4, m1     ; sum += bias

>+        addps m4, m5     ; sum += 0.5
I don't know how about precision mismatch if we pre-compute (bias+0.5)


>+        cvttps2dq m4, m4
>+        packssdw m4, m4
>+        packuswb m4, m4
>+        movss [dstq + dst_offq], m4
>+        add c_offq, mmsize/4
>+        add dst_offq, mmsize/4
>+
>+        add off16q, mmsize/4
>+        cmp off16q, widthq
>+        jl .loop16
>+
>+    add widthq, rq
>+    cmp off16q, widthq
>+    jge .paraend
>+

>+    .loopr:
no idea about this loop, if we can read beyond, we can reuse above SIMD code


>+        xor sumd, sumd
>+        xor iq, iq
>+        .loopr_i:
>+            mov ciq, [ptrq + iq * gprsize]
>+            movzx rd, byte [ciq + c_offq]
>+            imul rd, [matrixq + 4*iq]
>+            add sumd, rd
>+
>+            add iq, 1
>+            cmp iq, radq
>+            jl .loopr_i
>+
>+        pxor m4, m4
>+        cvtsi2ss m4, sumd
>+        mulss m4, m0     ; sum *= rdiv
>+        addss m4, m1     ; sum += bias
>+        addss m4, m5     ; sum += 0.5
>+        cvttps2dq m4, m4
>+        packssdw m4, m4
>+        packuswb m4, m4
>+        movd sumd, m4
>+        mov [dstq + dst_offq], sumb
>+        add c_offq, 1
>+        add dst_offq, 1
>+        add off16q, 1
>+        cmp off16q, widthq
>+        jl .loopr
>+
>+    .paraend:
>+    sub c_offq, widthq
>+    sub dst_offq, widthq
>+    add c_offq, strideq
>+    add dst_offq, dstrideq
>+
>+    sub heightq, 1
>+    cmp heightq, 0
>+    jg .loopy
>+
>+.end:
>+    RET