[FFmpeg-devel] [PATCH 2/2] tta/x86: add ff_ttafilter_process_dec_{ssse3, sse4}
James Almer
jamrial at gmail.com
Tue Feb 11 02:34:17 CET 2014
On 10/02/14 10:12 PM, Christophe Gisquet wrote:
> Hi,
>
> 2014-02-11 0:51 GMT+01:00 James Almer <jamrial at gmail.com>:
>
>> +INIT_XMM ssse3
>> +cglobal ttafilter_process_dec, 2,2,8, filter, in
>> + mova m2, [filterq + 0xc ]
>> + mova m3, [filterq + 0x1c]
>> + mova m4, [filterq + 0x4c]
>> + mova m5, [filterq + 0x5c]
>> +
>> + movd m6, [filterq + 0x8 ]
>
> err... I'm missing something or is this code prone to perform aligned
> reads on unaligned addresses?
>
> Seems a bit lucky because of this:
> typedef struct TTAFilter {
> int32_t shift, round, error;
> int32_t qm[MAX_ORDER];
> int32_t dx[MAX_ORDER];
> int32_t dl[MAX_ORDER];
> } TTAFilter;
>
> [...]
>
> typedef struct TTAChannel {
> int32_t predictor;
> TTAFilter filter;
> TTARice rice;
> } TTAChannel;
>
> sizeof(TTAChannel) should be 4*(3*MAX_ORDER+8)=224 (multiple of 16)
> and the offset of the TTAFilter 4, so all mova actually end up on
> aligned addresses.
>
> This seems to highly rely on the TTAChannel being allocated at an
> aligned address, the structures not being modified, a compiler not
> deciding to add padding between struct members, and the offsets
> remaining constant. It is probably ok, but I'm not terribly confident
> here.
>
If it's preferred i can rewrite this so we call the asm functions with
each of the TTAFilter elements as arguments instead of the struct as a
whole.
>> + pshufd m6, m6, 0
>> + psignd m0, m4, m6
>> + psignd m1, m5, m6
>> + paddd m2, m0
>> + paddd m3, m1
>> + mova [filterq + 0xc ], m2
>> + mova [filterq + 0x1c], m3
>> +
> [...]
>> +
>> + psignd m1, [pd_m1]
>> + psrldq m1, 4
>> + pslldq m6, 12
>> + pshufd m6, m6, 0xf0
>> + paddd m1, m6
>> + psrldq m6, m1, 4
>> + pshufd m6, m6, 0xf4
>> + paddd m1, m6
>> + psrldq m6, 4
>> + paddd m1, m6
>> + mova [filterq + 0x9c], m1
>> + RET
>> +
>> +INIT_XMM sse4
>> +cglobal ttafilter_process_dec, 2,2,7, filter, in
>> + mova m2, [filterq + 0xc ]
>> + mova m3, [filterq + 0x1c]
>> + mova m4, [filterq + 0x4c]
>> + mova m5, [filterq + 0x5c]
>> +
>> + movd m6, [filterq + 0x8 ] ; if (filter->error < 0) {
>> + pshufd m6, m6, 0 ; for (int i = 0; i < 8; i++)
>> + psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
>> + psignd m1, m5, m6 ; } else if (filter->error > 0) {
>> + paddd m2, m0 ; for (int i = 0; i < 8; i++)
>> + paddd m3, m1 ; filter->qm[i] += filter->dx[i];
>> + mova [filterq + 0xc ], m2 ; }
>> + mova [filterq + 0x1c], m3 ;
>> +
> [...]
>> + psignd m1, [pd_m1] ;
>> + psrldq m1, 4 ;
>> + pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
>> + pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
>> + paddd m1, m0 ; filter->dl[6] = *in - filter->dl[7];
>> + psrldq m0, m1, 4 ; filter->dl[7] = *in;
>> + pshufd m0, m0, 0xf4 ; filter->dl[5] += filter->dl[6];
>> + paddd m1, m0 ; filter->dl[4] += filter->dl[5];
>> + psrldq m0, 4 ;
>> + paddd m1, m0 ;
>> + mova [filterq + 0x9c], m1 ;
>> + RET
>
> Could you please macroize this code? Of interest are the macros
> ARCH_X86_64/mmsize (8/16/32, size of main SIMD reg)/cpuflag()
>
> For your above case, this gives for instance:
> %macro TTA_FILTER 0
> cglobal ttafilter_process_dec, 2,2,7, filter, in
> mova m2, [filterq + 0xc ]
> mova m3, [filterq + 0x1c]
> mova m4, [filterq + 0x4c]
> mova m5, [filterq + 0x5c]
>
> movd m6, [filterq + 0x8 ] ; if (filter->error < 0) {
> pshufd m6, m6, 0 ; for (int i = 0; i < 8; i++)
> psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
> psignd m1, m5, m6 ; } else if (filter->error > 0) {
> paddd m2, m0 ; for (int i = 0; i < 8; i++)
> paddd m3, m1 ; filter->qm[i] += filter->dx[i];
> mova [filterq + 0xc ], m2 ; }
> mova [filterq + 0x1c], m3 ;
> %if cpuflags(sse4)
> ; some code
> %else
> ; SSSE3 code
> %endif
> ; tail
> %endmacro
>
> INIT_XMM ssse3
> TTA_FILTER
>
> INIT_XMM sse4
> TTA_FILTER
>
> There are plenty of such examples, just grep for them in libavcodec/x86
I didn't macroize because the latter part of the SSE4 version used
different registers in most instructions (Since it needs less than the
SSSE3 one so i reused a few).
I'll make both versions use the same registers nonetheless so i can use a
macro.
>
> I haven't quite checked if the code is optimal, but I haven't seen any
> other issue. Maybe using more registers to break dependencies, but
> that's a short function, and there's no loop to amortize their use.
>
> Thanks,
>
More information about the ffmpeg-devel
mailing list