[Ffmpeg-devel] PATCH Blackfin optimized byte swapping mechanism
Marc Hoffman
mmh
Mon Apr 23 19:12:47 CEST 2007
Michael Niedermayer writes:
> Hi
>
> On Tue, Apr 17, 2007 at 08:49:40AM -0400, Marc Hoffman wrote:
> > Michael Niedermayer writes:
> > > Hi
> > >
> > > On Tue, Apr 17, 2007 at 07:40:47AM -0400, Marc Hoffman wrote:
> > > Content-Description: message body text
> > > >
> > > > > Low level bswap primitive for the Blackfin Architecture.
> > > >
> > > > Sorry, the patch was mangled by the wrong encoding last time.
> > >
> > > what advantage do these functions have over the default?
> > > are they faster? if so you should provide some benchmarks
> >
> > Sorry about the top post; please forgive me.
> >
> > The current 32bit byte swap routine produces this code sequence
> >
> > So I guess this is about 300% improvement in performance for this function.
>
> A guess is good, a hard benchmark is better; it's just 5 minutes of work to write a
> loop of bswap calls and run "time myprog".
> Also, don't forget to set the proper -mcpu / -march and -O3 with gcc.
Correction: ~200%. Is the patch acceptable now?
yoda:~/bs mmh$ bfin-linux-uclibc-gcc -O3 bswap.c -o bswap/bs/bswap
yoda:~/bs mmh$ rsh -l root mad /u/bs/bswap
fast is 12608161
slow is 24637378
improvement: 195.408180
yoda:~/bs mmh$
#include <stdio.h>
#define av_always_inline
typedef unsigned int uint32_t;
/*
 * Byte-swap a 32-bit value using four Blackfin instructions issued through
 * GCC extended inline assembly.
 *
 * x       value to swap; it is also the asm's read-write operand (%0).
 * returns x after the asm sequence has run.
 *
 * NOTE(review): the per-instruction comments below follow the Blackfin ISA's
 * "(V)" (per-16-bit-half vector shift) and PACK semantics as I understand
 * them -- confirm against the processor programming reference.
 */
static av_always_inline uint32_t fast_bswap_32(uint32_t x){
/* Scratch operand (%1); written by the asm before it is ever read. */
unsigned tmp;
/* %1 = each 16-bit half of x shifted right by 8 (presumably high byte -> low byte) */
asm("%1 = %0 >> 8 (V);\n\t"
/* %0 = each 16-bit half of x shifted left by 8 (presumably low byte -> high byte) */
"%0 = %0 << 8 (V);\n\t"
/* merge the two partial results: bytes are now swapped inside each half */
"%0 = %0 | %1;\n\t"
/* exchange the two 16-bit halves to complete the 32-bit swap */
"%0 = PACK(%0.L, %0.H);\n\t"
/* "+d": x is both input and output; "=&d": tmp is an early-clobber output,
   so the compiler must not place it in the same register as x. */
: "+d"(x), "=&d"(tmp));
return x;
}
/*
 * Reference byte swap of a 32-bit value written in plain C; this is the
 * baseline that the benchmark compares against the inline-asm version.
 *
 * x       value whose four bytes are to be reversed.
 * returns x with its byte order reversed (0x11223344 -> 0x44332211).
 */
static av_always_inline uint32_t slow_bswap_32(uint32_t x){
    /* Stage 1: exchange the two bytes inside each 16-bit half. */
    const uint32_t moved_up   = (x << 8) & 0xFF00FF00;
    const uint32_t moved_down = (x >> 8) & 0x00FF00FF;
    const uint32_t pairs      = moved_up | moved_down;

    /* Stage 2: exchange the two 16-bit halves. */
    return (pairs << 16) | (pairs >> 16);
}
/* Work buffer that the benchmark loops in main() byte-swap in place. */
unsigned long buf[2048];

/*
 * clock(): evaluates to the current value of the "cycles" register, read
 * into a data register through inline asm. The asm is volatile so that the
 * compiler does not drop or merge the reads.
 */
#define clock() ({ int _t; asm volatile ("%0=cycles;" : "=d" (_t)); _t; })

/*
 * clockdiff(x): evaluates to (current cycles - x), i.e. the cycles elapsed
 * since the clock() sample x.
 *
 * The output operand is marked early-clobber ("=&d") because the asm writes
 * %0 (with the fresh cycle count) BEFORE it reads the input %1. Without '&'
 * GCC is free to allocate %0 and %1 to the same register, in which case the
 * start value would be overwritten and the subtraction would yield 0.
 */
#define clockdiff(x) ({ int _t; asm volatile ("%0=cycles; %0=%0-%1;" : "=&d" (_t) : "d" (x)); _t; })
/*
 * Benchmark driver.
 *
 * Fills buf with a fixed pattern, then times 1000 passes over all 2048
 * entries with fast_bswap_32() and another 1000 passes with slow_bswap_32(),
 * using the clock()/clockdiff() cycle-counter macros. Prints both cycle
 * counts and the slow time expressed as a percentage of the fast time
 * (100 * t1 / t0).
 *
 * Returns 0.
 */
int main (void)
{
    int j;
    int i;
    long st,t0,t1;

    /* Give every entry the same known value before timing. */
    for (i=0; i < 2048;i++) {
        buf[i]=0x11223344;
    }

    /* Time the inline-asm implementation. */
    st = clock ();
    for (j=0;j<1000;j++) {
        for (i=0;i<2048;i++) {
            buf[i]= fast_bswap_32(buf[i]);
        }
    }
    t0 = clockdiff(st);

    /* Time the plain C implementation over the same amount of work. */
    st = clock ();
    for (j=0;j<1000;j++) {
        for (i=0;i<2048;i++) {
            buf[i]= slow_bswap_32(buf[i]);
        }
    }
    t1 = clockdiff(st);

    /* t0 and t1 are long, so they must be printed with %ld rather than %d. */
    printf ("fast is %ld\nslow is %ld\nimprovement: %f\n", t0,t1, 100.0*t1/t0);
    return 0;
}
More information about the ffmpeg-devel
mailing list