[FFmpeg-devel] [PATCH 4/8] avcodec/flac: partially unroll loop in flac_enc_lpc_32
Rostislav Pehlivanov
atomnuker at gmail.com
Mon Nov 27 01:21:15 EET 2017
On 26 November 2017 at 22:51, James Darnley <james.darnley at gmail.com> wrote:
> Now does 6 samples per iteration, up from 2.
>
> This is a further 1.6 to 2.1 times faster, or 2.5 to 3.9 times faster overall.
> Runtime is reduced by a further 4 to 17%, or by 9 to 65% overall.
>
> Same conditions as previously.
> ---
> libavcodec/x86/flac_dsp_gpl.asm | 30 +++++++++++++++++++++++++-----
> 1 file changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 618306eb5f..4d212ed212 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
> %macro FUNCTION_BODY_32 0
>
> %if ARCH_X86_64
> - cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs
> + cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
> DECLARE_REG_TMP 5, 6
> %define length r2d
>
> movsxd orderq, orderd
> %else
> - cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, len, order, coefs
> + cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
> DECLARE_REG_TMP 2, 5
> %define length r2mp
> %endif
> @@ -190,6 +190,8 @@ mova [rsp], m4 ; save sign extend mask
>
> .looplen:
> pxor m0, m0
> + pxor m4, m4
> + pxor m6, m6
> mov posj, orderq
> xor negj, negj
>
> @@ -197,23 +199,41 @@ mova [rsp], m4 ; save sign extend mask
> movd m2, [coefsq+posj*4] ; c = coefs[j]
> SPLATD m2
> pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> + pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
> + pmovzxdq m7, [smpq+negj*4-4+mmsize]
> pmuldq m1, m2
> + pmuldq m5, m2
> + pmuldq m7, m2
> paddq m0, m1 ; p += c * s
> + paddq m4, m5
> + paddq m6, m7
>
> dec negj
> inc posj
> jnz .looporder
>
> HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> + HACK_PSRAQ m4, m3, [rsp], m2
> + HACK_PSRAQ m6, m3, [rsp], m2
> CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> + CLIPQ m4, [pq_int_min], [pq_int_max], m2
> + CLIPQ m6, [pq_int_min], [pq_int_max], m2
> pshufd m0, m0, q0020 ; pack into first 2 dwords
> + pshufd m4, m4, q0020
> + pshufd m6, m6, q0020
> movh m1, [smpq]
> + movh m5, [smpq+mmsize/2]
> + movh m7, [smpq+mmsize]
> psubd m1, m0 ; smp[i] - p
> + psubd m5, m4
> + psubd m7, m6
> movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
> + movh [resq+mmsize/2], m5
> + movh [resq+mmsize], m7
>
> - add resq, mmsize/2
> - add smpq, mmsize/2
> - sub length, mmsize/8
> + add resq, (3*mmsize)/2
> + add smpq, (3*mmsize)/2
> + sub length, (3*mmsize)/8
> jg .looplen
> RET
>
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
LGTM, thanks.
More information about the ffmpeg-devel mailing list