[FFmpeg-devel] [PATCH 7/9] sbcenc: add MMX optimizations

Aurelien Jacobs aurel at gnuage.org
Sat Feb 24 14:05:50 EET 2018


On Thu, Feb 22, 2018 at 05:21:57PM +0000, Rostislav Pehlivanov wrote:
> On 21 February 2018 at 22:37, Aurelien Jacobs <aurel at gnuage.org> wrote:
> [...]
> > +;*******************************************************************
> > +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
> > +;*******************************************************************
> > +INIT_MMX mmx
> > +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
> > +    movq          m0, [inq]
> > +    movq          m1, [inq+8]
> > +    pmaddwd       m0, [constsq]
> > +    pmaddwd       m1, [constsq+8]
> > +    paddd         m0, [scale_mask]
> > +    paddd         m1, [scale_mask]
> > +
> > +    movq          m2, [inq+16]
> > +    movq          m3, [inq+24]
> > +    pmaddwd       m2, [constsq+16]
> > +    pmaddwd       m3, [constsq+24]
> > +    paddd         m0, m2
> > +    paddd         m1, m3
> > +
> > +    movq          m2, [inq+32]
> > +    movq          m3, [inq+40]
> > +    pmaddwd       m2, [constsq+32]
> > +    pmaddwd       m3, [constsq+40]
> > +    paddd         m0, m2
> > +    paddd         m1, m3
> > +
> > +    movq          m2, [inq+48]
> > +    movq          m3, [inq+56]
> > +    pmaddwd       m2, [constsq+48]
> > +    pmaddwd       m3, [constsq+56]
> > +    paddd         m0, m2
> > +    paddd         m1, m3
> > +
> > +    movq          m2, [inq+64]
> > +    movq          m3, [inq+72]
> > +    pmaddwd       m2, [constsq+64]
> > +    pmaddwd       m3, [constsq+72]
> > +    paddd         m0, m2
> > +    paddd         m1, m3
> >
> 
> You can turn the top 3 blocks into a macro
> 
> [...]
> > +;*******************************************************************
> > +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
> > +;*******************************************************************
> > +INIT_MMX mmx
> > +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
> > +    movq          m0, [inq]
> > +    movq          m1, [inq+8]
> > +    movq          m2, [inq+16]
> > +    movq          m3, [inq+24]
> > +    pmaddwd       m0, [constsq]
> > +    pmaddwd       m1, [constsq+8]
> > +    pmaddwd       m2, [constsq+16]
> > +    pmaddwd       m3, [constsq+24]
> > +    paddd         m0, [scale_mask]
> > +    paddd         m1, [scale_mask]
> > +    paddd         m2, [scale_mask]
> > +    paddd         m3, [scale_mask]
> > +
> > +    movq          m4, [inq+32]
> > +    movq          m5, [inq+40]
> > +    movq          m6, [inq+48]
> > +    movq          m7, [inq+56]
> > +    pmaddwd       m4, [constsq+32]
> > +    pmaddwd       m5, [constsq+40]
> > +    pmaddwd       m6, [constsq+48]
> > +    pmaddwd       m7, [constsq+56]
> > +    paddd         m0, m4
> > +    paddd         m1, m5
> > +    paddd         m2, m6
> > +    paddd         m3, m7
> > +
> > +    movq          m4, [inq+64]
> > +    movq          m5, [inq+72]
> > +    movq          m6, [inq+80]
> > +    movq          m7, [inq+88]
> > +    pmaddwd       m4, [constsq+64]
> > +    pmaddwd       m5, [constsq+72]
> > +    pmaddwd       m6, [constsq+80]
> > +    pmaddwd       m7, [constsq+88]
> > +    paddd         m0, m4
> > +    paddd         m1, m5
> > +    paddd         m2, m6
> > +    paddd         m3, m7
> > +
> > +    movq          m4, [inq+96]
> > +    movq          m5, [inq+104]
> > +    movq          m6, [inq+112]
> > +    movq          m7, [inq+120]
> > +    pmaddwd       m4, [constsq+96]
> > +    pmaddwd       m5, [constsq+104]
> > +    pmaddwd       m6, [constsq+112]
> > +    pmaddwd       m7, [constsq+120]
> > +    paddd         m0, m4
> > +    paddd         m1, m5
> > +    paddd         m2, m6
> > +    paddd         m3, m7
> > +
> > +    movq          m4, [inq+128]
> > +    movq          m5, [inq+136]
> > +    movq          m6, [inq+144]
> > +    movq          m7, [inq+152]
> > +    pmaddwd       m4, [constsq+128]
> > +    pmaddwd       m5, [constsq+136]
> > +    pmaddwd       m6, [constsq+144]
> > +    pmaddwd       m7, [constsq+152]
> > +    paddd         m0, m4
> > +    paddd         m1, m5
> > +    paddd         m2, m6
> > +    paddd         m3, m7
> >
> 
> And those 5 blocks
> 
> 
> > +
> > +    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
> > +    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
> > +    psrad         m2, 16    ; SBC_PROTO_FIXED_SCALE
> > +    psrad         m3, 16    ; SBC_PROTO_FIXED_SCALE
> > +
> > +    packssdw      m0, m0
> > +    packssdw      m1, m1
> > +    packssdw      m2, m2
> > +    packssdw      m3, m3
> > +
> > +    movq          m4, m0
> > +    movq          m5, m0
> > +    pmaddwd       m4, [constsq+160]
> > +    pmaddwd       m5, [constsq+168]
> > +
> > +    movq          m6, m1
> > +    movq          m7, m1
> > +    pmaddwd       m6, [constsq+192]
> > +    pmaddwd       m7, [constsq+200]
> > +    paddd         m4, m6
> > +    paddd         m5, m7
> > +
> > +    movq          m6, m2
> > +    movq          m7, m2
> > +    pmaddwd       m6, [constsq+224]
> > +    pmaddwd       m7, [constsq+232]
> > +    paddd         m4, m6
> > +    paddd         m5, m7
> > +
> > +    movq          m6, m3
> > +    movq          m7, m3
> > +    pmaddwd       m6, [constsq+256]
> > +    pmaddwd       m7, [constsq+264]
> > +    paddd         m4, m6
> > +    paddd         m5, m7
> >
> 
> Reuse the first macro here
> 
> That should save quite a bit of code

OK, here is a "macroified" version of the code.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0007-sbcenc-add-MMX-optimizations.patch
Type: text/x-diff
Size: 10891 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20180224/34068760/attachment.patch>


More information about the ffmpeg-devel mailing list