[FFmpeg-devel] [PATCHv2] lavu/x86/lls: add fma3 optimizations for update_lls
James Almer
jamrial at gmail.com
Fri Jan 15 01:23:04 CET 2016
On 1/14/2016 9:06 PM, Ganesh Ajjanagadde wrote:
> On Thu, Jan 14, 2016 at 6:54 PM, Ganesh Ajjanagadde <gajjanag at mit.edu> wrote:
>> On Thu, Jan 14, 2016 at 6:42 PM, James Almer <jamrial at gmail.com> wrote:
>>> On 1/14/2016 7:46 PM, Ganesh Ajjanagadde wrote:
>>>> This improves accuracy (very slightly) and speed for processors having
>>>> fma3.
>>>>
>>>> Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
>>>> old:
>>>> 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
>>>> 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
>>>>
>>>> new:
>>>> 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
>>>> 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
>>>>
>>>> Tested with FATE and --disable-fma3, also examined contents of
>>>> lavu/lls-test.
>>>>
>>>> Reviewed-by: James Almer <jamrial at gmail.com>
>>>> Reviewed-by: Henrik Gramner <henrik at gramner.com>
>>>> Signed-off-by: Ganesh Ajjanagadde <gajjanagadde at gmail.com>
>>>> ---
>>>> libavutil/x86/lls.asm | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
>>>> libavutil/x86/lls_init.c | 4 ++++
>>>> 2 files changed, 63 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
>>>> index 769befb..925cbdb 100644
>>>> --- a/libavutil/x86/lls.asm
>>>> +++ b/libavutil/x86/lls.asm
>>>> @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
>>>> .ret:
>>>> REP_RET
>>>>
>>>> -%if HAVE_AVX_EXTERNAL
>>>> -INIT_YMM avx
>>>> +%macro UPDATE_LLS 0
>>>> cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>>>> %define covarq ctxq
>>>> mov countd, [ctxq + LLSModel.indep_count]
>>>> @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>>>> vbroadcastsd ymm6, [varq + iq*8 + 16]
>>>> vbroadcastsd ymm7, [varq + iq*8 + 24]
>>>> vextractf128 xmm3, ymm1, 1
>>>> +%if cpuflag(fma3)
>>>> + mova ymm0, COVAR(iq ,0)
>>>> + mova xmm2, COVAR(iq+2,2)
>>>> + fmaddpd ymm0, ymm1, ymm4, ymm0
>>>> + fmaddpd xmm2, xmm3, xmm6, xmm2
>>>> + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
>>>> + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
>>>> + mova COVAR(iq ,0), ymm0
>>>> + mova COVAR(iq ,1), ymm1
>>>> + mova COVAR(iq+2,2), xmm2
>>>> + mova COVAR(iq+2,3), xmm3
>>>> +%else
>>>> vmulpd ymm0, ymm1, ymm4
>>>> vmulpd ymm1, ymm1, ymm5
>>>> vmulpd xmm2, xmm3, xmm6
>>>> @@ -148,12 +159,27 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>>>> ADDPD_MEM COVAR(iq ,1), ymm1
>>>> ADDPD_MEM COVAR(iq+2,2), xmm2
>>>> ADDPD_MEM COVAR(iq+2,3), xmm3
>>>> +%endif ; cpuflag(fma3)
>>>> lea jd, [iq + 4]
>>>> cmp jd, count2d
>>>> jg .skip4x4
>>>> .loop4x4:
>>>> ; Compute all 16 pairwise products of a 4x4 block
>>>> mova ymm3, [varq + jq*8]
>>>> +%if cpuflag(fma3)
>>>> + mova ymm0, COVAR(jq, 0)
>>>> + mova ymm1, COVAR(jq, 1)
>>>> + mova ymm2, COVAR(jq, 2)
>>>> + mova ymm3, COVAR(jq, 3)
>>>
>>> This is wrong. You're overwriting the contents of ymm3. libavutil/lls-test didn't reflect
>>> this at all?
>>>
>>>> + fmaddpd ymm0, ymm3, ymm4, ymm0
>>>> + fmaddpd ymm1, ymm3, ymm5, ymm1
>>>> + fmaddpd ymm2, ymm3, ymm6, ymm2
>>>> + fmaddpd ymm3, ymm3, ymm7, ymm3
>>>> + mova COVAR(jq, 0), ymm0
>>>> + mova COVAR(jq, 1), ymm1
>>>> + mova COVAR(jq, 2), ymm2
>>>> + mova COVAR(jq, 3), ymm3
>>>> +%else
>>>> vmulpd ymm0, ymm3, ymm4
>>>> vmulpd ymm1, ymm3, ymm5
>>>> vmulpd ymm2, ymm3, ymm6
>>>> @@ -162,6 +188,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>>>> ADDPD_MEM COVAR(jq,1), ymm1
>>>> ADDPD_MEM COVAR(jq,2), ymm2
>>>> ADDPD_MEM COVAR(jq,3), ymm3
>>>> +%endif ; cpuflag(fma3)
>>>> add jd, 4
>>>> cmp jd, count2d
>>>> jle .loop4x4
>>>> @@ -169,6 +196,20 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>>>> cmp jd, countd
>>>> jg .skip2x4
>>>> mova xmm3, [varq + jq*8]
>>>> +%if cpuflag(fma3)
>>>> + mova xmm0, COVAR(jq, 0)
>>>> + mova xmm1, COVAR(jq, 1)
>>>> + mova xmm2, COVAR(jq, 2)
>>>> + mova xmm3, COVAR(jq, 3)
>>>
>>> Same here.
>>
>> Diff /tmp/lls.old /tmp/lls.new:
>> < real:-0.326534 order:0 pred:-0.588792 var:0.272632 coeffs:1.427836
>> -0.385559 0.000000
>> < real:-0.326534 order:1 pred:-0.326534 var:-nan coeffs:3.274787
>> -1.436430 0.000000
>> < real:-0.326534 order:2 pred:-0.326534 var:0.000000 coeffs:3.274787
>> -1.436430 0.000000
>> ---
>>> real:-0.326534 order:0 pred:-0.588792 var:0.272632 coeffs:1.427836 -0.385559 -0.000000
>>> real:-0.326534 order:1 pred:-0.326534 var:0.000000 coeffs:3.274787 -1.436430 0.000000
>>> real:-0.326534 order:2 pred:-0.326534 var:-nan coeffs:3.274787 -1.436430 -0.000000
>>
>> You are definitely right, though. Removed 2 lines:
>> mova ymm3, COVAR(jq,3)
>> and
>> mova xmm3, COVAR(jq,2)
>
> Still buggy. Will calmly examine this some time.
You also need to use COVAR(jq, #) as the fourth argument for the two fmaddpd, like you did
in .loopi
More information about the ffmpeg-devel
mailing list