[FFmpeg-devel] [PATCH 2/2] tta/x86: add ff_ttafilter_process_dec_{ssse3, sse4}

Tue Feb 11 02:34:17 CET 2014

On 10/02/14 10:12 PM, Christophe Gisquet wrote:
> Hi,
> 
> 2014-02-11 0:51 GMT+01:00 James Almer <jamrial at gmail.com>:
> 
>> +INIT_XMM ssse3
>> +cglobal ttafilter_process_dec, 2,2,8, filter, in
>> +    mova       m2, [filterq + 0xc ]
>> +    mova       m3, [filterq + 0x1c]
>> +    mova       m4, [filterq + 0x4c]
>> +    mova       m5, [filterq + 0x5c]
>> +
>> +    movd       m6, [filterq + 0x8 ]
> 
> err... I'm missing something or is this code prone to perform aligned
> reads on unaligned addresses?
> 
> Seems a bit lucky because of this:
> typedef struct TTAFilter {
>     int32_t shift, round, error;
>     int32_t qm[MAX_ORDER];
>     int32_t dx[MAX_ORDER];
>     int32_t dl[MAX_ORDER];
> } TTAFilter;
> 
> [...]
> 
> typedef struct TTAChannel {
>     int32_t predictor;
>     TTAFilter filter;
>     TTARice rice;
> } TTAChannel;
> 
> sizeof(TTAChannel) should be 4*(3*MAX_ORDER+8)=224 (multiple of 16)
> and the offset of the TTAFilter 4, so all mova actually end up on
> aligned addresses.
> 
> This seems to highly rely on the TTAChannel being allocated at an
> aligned address, the structures not being modified, a compiler not
> deciding to add padding between struct members, and the offsets
> remaining constant. It is probably ok, but I'm not terribly confident
> here.
> 

If it's preferred i can rewrite this so we call the asm functions with 
each of the TTAFilter elements as arguments instead of the struct as a 
whole.

>> +    pshufd     m6, m6, 0
>> +    psignd     m0, m4, m6
>> +    psignd     m1, m5, m6
>> +    paddd      m2, m0
>> +    paddd      m3, m1
>> +    mova       [filterq + 0xc ], m2
>> +    mova       [filterq + 0x1c], m3
>> +
> [...]
>> +
>> +    psignd     m1, [pd_m1]
>> +    psrldq     m1, 4
>> +    pslldq     m6, 12
>> +    pshufd     m6, m6, 0xf0
>> +    paddd      m1, m6
>> +    psrldq     m6, m1, 4
>> +    pshufd     m6, m6, 0xf4
>> +    paddd      m1, m6
>> +    psrldq     m6, 4
>> +    paddd      m1, m6
>> +    mova       [filterq + 0x9c], m1
>> +    RET
>> +
>> +INIT_XMM sse4
>> +cglobal ttafilter_process_dec, 2,2,7, filter, in
>> +    mova       m2, [filterq + 0xc ]
>> +    mova       m3, [filterq + 0x1c]
>> +    mova       m4, [filterq + 0x4c]
>> +    mova       m5, [filterq + 0x5c]
>> +
>> +    movd       m6, [filterq + 0x8 ] ; if (filter->error < 0) {
>> +    pshufd     m6, m6, 0            ;     for (int i = 0; i < 8; i++)
>> +    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
>> +    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
>> +    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
>> +    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
>> +    mova       [filterq + 0xc ], m2 ; }
>> +    mova       [filterq + 0x1c], m3 ;
>> +
> [...]
>> +    psignd     m1, [pd_m1]          ;
>> +    psrldq     m1, 4                ;
>> +    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
>> +    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
>> +    paddd      m1, m0               ; filter->dl[6] = *in - filter->dl[7];
>> +    psrldq     m0, m1, 4            ; filter->dl[7] = *in;
>> +    pshufd     m0, m0, 0xf4         ; filter->dl[5] += filter->dl[6];
>> +    paddd      m1, m0               ; filter->dl[4] += filter->dl[5];
>> +    psrldq     m0, 4                ;
>> +    paddd      m1, m0               ;
>> +    mova       [filterq + 0x9c], m1 ;
>> +    RET
> 
> Could you please macroize this code? Of interest are the macros
> ARCH_X86_64/mmsize (8/16/32, size of main SIMD reg)/cpuflag()
> 
> For your above case, this gives for instance:
> %macro TTA_FILTER 0
> cglobal ttafilter_process_dec, 2,2,7, filter, in
>    mova       m2, [filterq + 0xc ]
>    mova       m3, [filterq + 0x1c]
>    mova       m4, [filterq + 0x4c]
>    mova       m5, [filterq + 0x5c]
> 
>    movd       m6, [filterq + 0x8 ] ; if (filter->error < 0) {
>    pshufd     m6, m6, 0            ;     for (int i = 0; i < 8; i++)
>    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
>    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
>    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
>    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
>    mova       [filterq + 0xc ], m2 ; }
>    mova       [filterq + 0x1c], m3 ;
> %if cpuflags(sse4)
>    ; some code
> %else
>    ; SSSE3 code
> %endif
>    ; tail
> %endmacro
> 
> INIT_XMM ssse3
> TTA_FILTER
> 
> INIT_XMM sse4
> TTA_FILTER
> 
> There are plenty of such examples, just grep for them in libavcodec/x86

I didn't macroize because the latter part of the SSE4 version used 
different registers in most instructions (Since it needs less than the 
SSSE3 one so i reused a few).
I'll make both versions use the same registers nonetheless so i can use a 
macro.

> 
> I haven't quite checked if the code is optimal, but I haven't seen any
> other issue. Maybe using more registers to break dependencies, but
> that's a short function, and there's no loop to amortize their use.
> 
> Thanks,
>