[FFmpeg-devel] [PATCH 7/8] lavc/flacenc: add AVX2 version of the 32-bit LPC encoder
Rostislav Pehlivanov
atomnuker at gmail.com
Mon Nov 27 01:13:46 EET 2017
On 26 November 2017 at 22:51, James Darnley <james.darnley at gmail.com> wrote:
> When compared to the SSE4.2 version, runtime is reduced by 1 to 26%. The
> function itself is around 2 times faster.
> ---
> libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
> libavcodec/x86/flacdsp_init.c | 5 +++-
> 2 files changed, 47 insertions(+), 14 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 91989ce560..749e66dec8 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -22,11 +22,11 @@
>
> %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> -pd_0_int_min: times 2 dd 0, -2147483648
> -pq_int_min: times 2 dq -2147483648
> -pq_int_max: times 2 dq 2147483647
> +pd_0_int_min: times 4 dd 0, -2147483648
> +pq_int_min: times 4 dq -2147483648
> +pq_int_max: times 4 dq 2147483647
>
> SECTION .text
>
> @@ -123,7 +123,10 @@ RET
> %endmacro
>
> %macro PMINSQ 3
> - pcmpgtq %3, %2, %1
> + mova %3, %2
> + ; We cannot use the 3-operand format because the memory location cannot be
> + ; the second operand, only the third.
> + pcmpgtq %3, %1
>
I don't get it. How did it work before, then?
> pand %1, %3
> pandn %3, %2
> por %1, %3
> @@ -177,11 +180,11 @@ lea resq, [resq+orderq*4]
> lea smpq, [smpq+orderq*4]
> lea coefsq, [coefsq+orderq*4]
> sub length, orderd
> -movd m3, r5m
> +movd xm3, r5m
> neg orderq
>
> movu m4, [pd_0_int_min] ; load 1 bit
> -psrad m4, m3 ; turn that into shift+1 bits
> +psrad m4, xm3 ; turn that into shift+1 bits
> pslld m4, 1 ; reduce that
> mova [rsp], m4 ; save sign extend mask
>
> @@ -197,8 +200,20 @@ mova [rsp], m4 ; save sign extend mask
> xor negj, negj
>
> .looporder1:
> +%if cpuflag(avx)
> + vbroadcastss m2, [coefsq+posj*4]
> +%else
> movd m2, [coefsq+posj*4] ; c = coefs[j]
> SPLATD m2
> +%endif
> +%if cpuflag(avx)
> + vpmuldq m1, m2, [smpq+negj*4-4]
> + vpmuldq m5, m2, [smpq+negj*4-4+mmsize]
> + vpmuldq m7, m2, [smpq+negj*4-4+mmsize*2]
> + vpaddq m0, m1
> + vpaddq m4, m5
> + vpaddq m6, m7
>
Why force VEX encoding for these instructions, on avx no less?
> +%else
> movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> movu m5, [smpq+negj*4-4+mmsize]
> movu m7, [smpq+negj*4-4+mmsize*2]
> @@ -212,14 +227,15 @@ mova [rsp], m4 ; save sign extend mask
> paddq m0, m1 ; p += c * s
> paddq m4, m5
> paddq m6, m7
> +%endif
>
> dec negj
> inc posj
> jnz .looporder1
>
> - HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> - HACK_PSRAQ m4, m3, [rsp], m2
> - HACK_PSRAQ m6, m3, [rsp], m2
> + HACK_PSRAQ m0, xm3, [rsp], m2 ; p >>= shift
> + HACK_PSRAQ m4, xm3, [rsp], m2
> + HACK_PSRAQ m6, xm3, [rsp], m2
> CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> CLIPQ m4, [pq_int_min], [pq_int_max], m2
> CLIPQ m6, [pq_int_min], [pq_int_max], m2
> @@ -241,8 +257,20 @@ mova [rsp], m4 ; save sign extend mask
> xor negj, negj
>
> .looporder2:
> +%if cpuflag(avx)
> + vbroadcastss m2, [coefsq+posj*4]
> +%else
> movd m2, [coefsq+posj*4] ; c = coefs[j]
> SPLATD m2
> +%endif
> +%if cpuflag(avx)
> + vpmuldq m1, m2, [smpq+negj*4]
> + vpmuldq m5, m2, [smpq+negj*4+mmsize]
> + vpmuldq m7, m2, [smpq+negj*4+mmsize*2]
> + vpaddq m0, m1
> + vpaddq m4, m5
> + vpaddq m6, m7
> +%else
> movu m1, [smpq+negj*4] ; s = smp[i-j-1]
> movu m5, [smpq+negj*4+mmsize]
> movu m7, [smpq+negj*4+mmsize*2]
> @@ -252,14 +280,15 @@ mova [rsp], m4 ; save sign extend mask
> paddq m0, m1 ; p += c * s
> paddq m4, m5
> paddq m6, m7
> +%endif
>
> dec negj
> inc posj
> jnz .looporder2
>
> - HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> - HACK_PSRAQ m4, m3, [rsp], m2
> - HACK_PSRAQ m6, m3, [rsp], m2
> + HACK_PSRAQ m0, xm3, [rsp], m2 ; p >>= shift
> + HACK_PSRAQ m4, xm3, [rsp], m2
> + HACK_PSRAQ m6, xm3, [rsp], m2
> CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> CLIPQ m4, [pq_int_min], [pq_int_max], m2
> CLIPQ m6, [pq_int_min], [pq_int_max], m2
> @@ -300,3 +329,4 @@ FUNCTION_BODY_32
>
> INIT_YMM avx2
> FUNCTION_BODY_16
> +FUNCTION_BODY_32
> diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
> index f827186c26..fbe70894a0 100644
> --- a/libavcodec/x86/flacdsp_init.c
> +++ b/libavcodec/x86/flacdsp_init.c
> @@ -30,6 +30,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int
> coeffs[32], int order,
> void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const
> int32_t *,int);
> void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const
> int32_t *,int);
> void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int,
> const int32_t *,int);
> +void ff_flac_enc_lpc_32_avx2(int32_t *, const int32_t *, int, int, const
> int32_t *,int);
>
> #define DECORRELATE_FUNCS(fmt, opt)
> \
> void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in,
> int channels, \
> @@ -117,8 +118,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c,
> enum AVSampleFormat fmt, int
> c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
> }
> if (EXTERNAL_AVX2(cpu_flags)) {
> - if (CONFIG_GPL)
> + if (CONFIG_GPL) {
> c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
> + c->lpc32_encode = ff_flac_enc_lpc_32_avx2;
> + }
> }
> #endif
> #endif /* HAVE_X86ASM */
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
More information about the ffmpeg-devel mailing list