[FFmpeg-devel] [PATCH 7/8] lavc/flacenc: add AVX2 version of the 32-bit LPC encoder
James Darnley
james.darnley at gmail.com
Mon Nov 27 00:51:10 EET 2017
When compared to the SSE4.2 version, runtime is reduced by 1 to 26%. The
function itself is around 2 times faster.
---
libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
libavcodec/x86/flacdsp_init.c | 5 +++-
2 files changed, 47 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 91989ce560..749e66dec8 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -22,11 +22,11 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
-pd_0_int_min: times 2 dd 0, -2147483648
-pq_int_min: times 2 dq -2147483648
-pq_int_max: times 2 dq 2147483647
+pd_0_int_min: times 4 dd 0, -2147483648
+pq_int_min: times 4 dq -2147483648
+pq_int_max: times 4 dq 2147483647
SECTION .text
@@ -123,7 +123,10 @@ RET
%endmacro
%macro PMINSQ 3
- pcmpgtq %3, %2, %1
+ mova %3, %2
+ ; We cannot use the 3-operand format because the memory location cannot be
+ ; the second operand, only the third.
+ pcmpgtq %3, %1
pand %1, %3
pandn %3, %2
por %1, %3
@@ -177,11 +180,11 @@ lea resq, [resq+orderq*4]
lea smpq, [smpq+orderq*4]
lea coefsq, [coefsq+orderq*4]
sub length, orderd
-movd m3, r5m
+movd xm3, r5m
neg orderq
movu m4, [pd_0_int_min] ; load 1 bit
-psrad m4, m3 ; turn that into shift+1 bits
+psrad m4, xm3 ; turn that into shift+1 bits
pslld m4, 1 ; reduce that
mova [rsp], m4 ; save sign extend mask
@@ -197,8 +200,20 @@ mova [rsp], m4 ; save sign extend mask
xor negj, negj
.looporder1:
+%if cpuflag(avx)
+ vbroadcastss m2, [coefsq+posj*4]
+%else
movd m2, [coefsq+posj*4] ; c = coefs[j]
SPLATD m2
+%endif
+%if cpuflag(avx)
+ vpmuldq m1, m2, [smpq+negj*4-4]
+ vpmuldq m5, m2, [smpq+negj*4-4+mmsize]
+ vpmuldq m7, m2, [smpq+negj*4-4+mmsize*2]
+ vpaddq m0, m1
+ vpaddq m4, m5
+ vpaddq m6, m7
+%else
movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
movu m5, [smpq+negj*4-4+mmsize]
movu m7, [smpq+negj*4-4+mmsize*2]
@@ -212,14 +227,15 @@ mova [rsp], m4 ; save sign extend mask
paddq m0, m1 ; p += c * s
paddq m4, m5
paddq m6, m7
+%endif
dec negj
inc posj
jnz .looporder1
- HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
- HACK_PSRAQ m4, m3, [rsp], m2
- HACK_PSRAQ m6, m3, [rsp], m2
+ HACK_PSRAQ m0, xm3, [rsp], m2 ; p >>= shift
+ HACK_PSRAQ m4, xm3, [rsp], m2
+ HACK_PSRAQ m6, xm3, [rsp], m2
CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
CLIPQ m4, [pq_int_min], [pq_int_max], m2
CLIPQ m6, [pq_int_min], [pq_int_max], m2
@@ -241,8 +257,20 @@ mova [rsp], m4 ; save sign extend mask
xor negj, negj
.looporder2:
+%if cpuflag(avx)
+ vbroadcastss m2, [coefsq+posj*4]
+%else
movd m2, [coefsq+posj*4] ; c = coefs[j]
SPLATD m2
+%endif
+%if cpuflag(avx)
+ vpmuldq m1, m2, [smpq+negj*4]
+ vpmuldq m5, m2, [smpq+negj*4+mmsize]
+ vpmuldq m7, m2, [smpq+negj*4+mmsize*2]
+ vpaddq m0, m1
+ vpaddq m4, m5
+ vpaddq m6, m7
+%else
movu m1, [smpq+negj*4] ; s = smp[i-j-1]
movu m5, [smpq+negj*4+mmsize]
movu m7, [smpq+negj*4+mmsize*2]
@@ -252,14 +280,15 @@ mova [rsp], m4 ; save sign extend mask
paddq m0, m1 ; p += c * s
paddq m4, m5
paddq m6, m7
+%endif
dec negj
inc posj
jnz .looporder2
- HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
- HACK_PSRAQ m4, m3, [rsp], m2
- HACK_PSRAQ m6, m3, [rsp], m2
+ HACK_PSRAQ m0, xm3, [rsp], m2 ; p >>= shift
+ HACK_PSRAQ m4, xm3, [rsp], m2
+ HACK_PSRAQ m6, xm3, [rsp], m2
CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
CLIPQ m4, [pq_int_min], [pq_int_max], m2
CLIPQ m6, [pq_int_min], [pq_int_max], m2
@@ -300,3 +329,4 @@ FUNCTION_BODY_32
INIT_YMM avx2
FUNCTION_BODY_16
+FUNCTION_BODY_32
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index f827186c26..fbe70894a0 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -30,6 +30,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_32_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
@@ -117,8 +118,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
}
if (EXTERNAL_AVX2(cpu_flags)) {
- if (CONFIG_GPL)
+ if (CONFIG_GPL) {
c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
+ c->lpc32_encode = ff_flac_enc_lpc_32_avx2;
+ }
}
#endif
#endif /* HAVE_X86ASM */
--
2.15.0
More information about the ffmpeg-devel
mailing list