[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Martin Vignali
martin.vignali at gmail.com
Sun Dec 3 23:16:24 EET 2017
Checkasm result (OS X) for your last patch:
hflip_byte_c: 28.5
hflip_byte_ssse3: 29.0
hflip_short_c: 277.7
hflip_short_ssse3: 65.0
If you add a "cmp xq, wq" after the SIMD loop,
you can be faster than C (clang) when the width is a multiple of mmsize*2:
hflip_byte_c: 28.5
hflip_byte_ssse3: 27.5
see below
Otherwise it looks OK (I will send a much cleaner patch for the checkasm later,
and a patch to use one macro for both functions).
+
> +pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; pshufb indices 15..0: reverses the 16 bytes of a register
> +pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 ; index pairs in reverse pair order; presumably the mask for 16-bit samples (its user is not shown here)
> +
> +SECTION .text
> +
> +INIT_XMM ssse3
;-----------------------------------------------------------------------
; hflip_byte(src, dst, w): dst[x] = src[-x] for x = 0 .. w-1
; x86inc cglobal: 3 arguments, 6 GPRs, 3 vector registers.
;   src  read at offsets 0, -1, -2, ... (presumably the address of the
;        last byte of the input row - confirm against the caller)
;   dst  written at offsets 0, 1, 2, ...
;   w    number of bytes to copy
;   x    running output offset, negated temporarily to index src
;   v    one-byte temporary for the scalar tail
;   r    w modulo 2*mmsize = bytes left over after the SIMD loop
; Both loops test their condition at the bottom, so each body runs at
; least once when entered.
;-----------------------------------------------------------------------
> +cglobal hflip_byte, 3, 6, 3, src, dst, w, x, v, r
>
+ mova m0, [pb_flip_byte] ; m0 = byte-reversal shuffle mask
> + xor xq, xq ; x = 0
> + mov wd, dword wm ; reload w as a 32-bit value
> + mov rq, wq
> + and rq, 2 * mmsize - 1 ; r = w mod (2 * mmsize)
> + cmp wq, 2 * mmsize
> + jl .loop1 ; not enough bytes for one SIMD iteration: scalar copy only
> + sub wq, rq ; w rounded down to a multiple of 2 * mmsize
> +
> + .loop0:
> + neg xq ; x = -x so that src can be indexed backwards
> + movu m1, [srcq + xq - mmsize + 1] ; the mmsize bytes ending at src[-x]
> + movu m2, [srcq + xq - 2 * mmsize + 1] ; the mmsize bytes just before those
> + pshufb m1, m0 ; reverse byte order inside each register
> + pshufb m2, m0
> + neg xq ; back to the positive output offset
> + movu [dstq + xq ], m1
> + movu [dstq + xq + mmsize], m2
> + add xq, mmsize * 2
> + cmp xq, wq
> + jl .loop0
>
; Skip the scalar tail only when nothing is left over. The check must look at
; r: w was rounded down above and x advances in steps of 2 * mmsize, so x == w
; on every exit from .loop0 and an x-versus-w compare here would always skip
; the tail.
test rq, rq
jz .end
> +
> + add wq, rq ; restore the full width for the tail loop
> +
> + .loop1:
> + neg xq
> + mov vb, [srcq + xq] ; v = src[-x]
> + neg xq
> + mov [dstq + xq], vb ; dst[x] = v
> + add xq, 1
> + cmp xq, wq
> + jl .loop1
>
.end:
> +RET
>
More information about the ffmpeg-devel
mailing list