[FFmpeg-devel] [PATCH 6/8] lavc/x86/flac_dsp_gpl: partially unroll 32-bit LPC encoder
James Darnley
james.darnley at gmail.com
Mon Nov 27 00:51:09 EET 2017
Around 1.1 times faster, reducing runtime by up to 6%.
---
libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
1 file changed, 72 insertions(+), 19 deletions(-)
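
For context, the routine computes the usual FLAC LPC residual: a 64-bit prediction is accumulated from the previous `order` samples, shifted, clipped to the int32 range (the CLIPQ step in the asm) and subtracted from the current sample. A minimal scalar sketch of that arithmetic (hypothetical helper name, not the exact C reference in lavc) is:

#include <stdint.h>

/* Illustrative scalar form of what each vectorised iteration works on. */
static void lpc_encode_32_sketch(int32_t *res, const int32_t *smp, int len,
                                 int order, const int32_t *coefs, int shift)
{
    for (int i = order; i < len; i++) {
        int64_t p = 0;
        for (int j = 0; j < order; j++)
            p += (int64_t)coefs[j] * smp[i - j - 1];  /* p += c * s       */
        p >>= shift;                                  /* p >>= shift      */
        if (p > INT32_MAX) p = INT32_MAX;             /* clip(p >> shift) */
        if (p < INT32_MIN) p = INT32_MIN;
        res[i] = smp[i] - (int32_t)p;                 /* res[i] = ...     */
    }
}

With this patch each trip round .looplen stores three full registers of residuals instead of three half registers, i.e. twice as many samples per iteration (the length decrement changes from (3*mmsize)/8 to (3*mmsize)/4).
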
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 952fc8b86b..91989ce560 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -152,13 +152,13 @@ RET
%macro FUNCTION_BODY_32 0
%if ARCH_X86_64
- cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
+ cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
DECLARE_REG_TMP 5, 6
%define length r2d
movsxd orderq, orderd
%else
- cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
+ cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order, coefs
DECLARE_REG_TMP 2, 5
%define length r2mp
%endif
@@ -189,18 +189,23 @@ mova [rsp], m4 ; save sign extend mask
%define negj t1q
.looplen:
+ ; process "odd" samples
pxor m0, m0
pxor m4, m4
pxor m6, m6
mov posj, orderq
xor negj, negj
- .looporder:
+ .looporder1:
movd m2, [coefsq+posj*4] ; c = coefs[j]
SPLATD m2
- pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
- pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
- pmovzxdq m7, [smpq+negj*4-4+mmsize]
+ movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+ movu m5, [smpq+negj*4-4+mmsize]
+ movu m7, [smpq+negj*4-4+mmsize*2]
+ ; Rather than explicitly unpacking adjacent samples into qwords, we can
+ ; let the pmuldq instruction unpack the 0th and 2nd samples for us when
+ ; it does its multiply. This saves an unpack for every sample in the
+ ; inner loop, which should make it (much) quicker.
pmuldq m1, m2
pmuldq m5, m2
pmuldq m7, m2
@@ -210,7 +215,7 @@ mova [rsp], m4 ; save sign extend mask
dec negj
inc posj
- jnz .looporder
+ jnz .looporder1
HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
HACK_PSRAQ m4, m3, [rsp], m2
@@ -218,22 +223,70 @@ mova [rsp], m4 ; save sign extend mask
CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
CLIPQ m4, [pq_int_min], [pq_int_max], m2
CLIPQ m6, [pq_int_min], [pq_int_max], m2
- pshufd m0, m0, q0020 ; pack into first 2 dwords
- pshufd m4, m4, q0020
- pshufd m6, m6, q0020
- movh m1, [smpq]
- movh m5, [smpq+mmsize/2]
- movh m7, [smpq+mmsize]
+ movu m1, [smpq]
+ movu m5, [smpq+mmsize]
+ movu m7, [smpq+mmsize*2]
psubd m1, m0 ; smp[i] - p
psubd m5, m4
psubd m7, m6
- movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
- movh [resq+mmsize/2], m5
- movh [resq+mmsize], m7
+ mova [rsp+mmsize], m1 ; res[i] = smp[i] - (p >> shift)
+ mova [rsp+mmsize*2], m5
+ mova [rsp+mmsize*3], m7
+
+ ; process "even" samples
+ pxor m0, m0
+ pxor m4, m4
+ pxor m6, m6
+ mov posj, orderq
+ xor negj, negj
+
+ .looporder2:
+ movd m2, [coefsq+posj*4] ; c = coefs[j]
+ SPLATD m2
+ movu m1, [smpq+negj*4] ; s = smp[i-j-1]
+ movu m5, [smpq+negj*4+mmsize]
+ movu m7, [smpq+negj*4+mmsize*2]
+ pmuldq m1, m2
+ pmuldq m5, m2
+ pmuldq m7, m2
+ paddq m0, m1 ; p += c * s
+ paddq m4, m5
+ paddq m6, m7
+
+ dec negj
+ inc posj
+ jnz .looporder2
+
+ HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
+ HACK_PSRAQ m4, m3, [rsp], m2
+ HACK_PSRAQ m6, m3, [rsp], m2
+ CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+ CLIPQ m4, [pq_int_min], [pq_int_max], m2
+ CLIPQ m6, [pq_int_min], [pq_int_max], m2
+ movu m1, [smpq+4]
+ movu m5, [smpq+4+mmsize]
+ movu m7, [smpq+4+mmsize*2]
+ psubd m1, m0 ; smp[i] - p
+ psubd m5, m4
+ psubd m7, m6
+
+ ; interleave odd and even samples
+ pslldq m1, 4
+ pslldq m5, 4
+ pslldq m7, 4
+
+ pblendw m1, [rsp+mmsize], q0303
+ pblendw m5, [rsp+mmsize*2], q0303
+ pblendw m7, [rsp+mmsize*3], q0303
+
+ movu [resq], m1
+ movu [resq+mmsize], m5
+ movu [resq+mmsize*2], m7
+
+ add resq, 3*mmsize
+ add smpq, 3*mmsize
+ sub length, (3*mmsize)/4
- add resq, (3*mmsize)/2
- add smpq, (3*mmsize)/2
- sub length, (3*mmsize)/8
jg .looplen
RET
--
2.15.0
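
The pmuldq trick the inline comment describes, together with the pslldq/pblendw interleave at the end of the loop, in rough SSE4.1 intrinsics form (illustrative names only, one 128-bit lane; the real loop uses three registers and keeps 64-bit accumulators):

#include <smmintrin.h>  /* SSE4.1 */
#include <stdint.h>

/* PMULDQ only reads dwords 0 and 2 of each operand and sign-extends them
 * itself, so a plain unaligned load replaces the explicit pmovzxdq
 * widening step. */
static inline __m128i mul_pass1(const int32_t *smp, __m128i coef)
{
    __m128i s = _mm_loadu_si128((const __m128i *)smp);  /* movu   m1, [smpq+negj*4-4] */
    return _mm_mul_epi32(s, coef);                      /* pmuldq m1, m2              */
}

static inline __m128i mul_pass2(const int32_t *smp, __m128i coef)
{
    /* The second pass simply loads one sample (4 bytes) further on, so the
     * other two samples of the group land in dwords 0 and 2. */
    __m128i s = _mm_loadu_si128((const __m128i *)(smp + 1));
    return _mm_mul_epi32(s, coef);
}

/* Once both passes have been shifted, clipped and subtracted, their packed
 * dword residuals are interleaved the same way the pslldq/pblendw pair
 * does it: move the second pass up one dword, then blend on dword
 * boundaries. */
static inline __m128i interleave_passes(__m128i pass1, __m128i pass2)
{
    pass2 = _mm_slli_si128(pass2, 4);            /* pslldq  m1, 4                   */
    return _mm_blend_epi16(pass2, pass1, 0x33);  /* pblendw m1, [rsp+mmsize], q0303 */
}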