[FFmpeg-devel] [PATCH] libavutil: add x86 optimized av_popcount

Wed Feb 25 22:35:26 CET 2015

On Wed, Feb 25, 2015 at 02:29:11PM -0300, James Almer wrote:
> On 25/02/15 12:43 PM, Clément Bœsch wrote:
> > On Tue, Feb 24, 2015 at 10:05:24PM -0300, James Almer wrote:
> >> Signed-off-by: James Almer <jamrial at gmail.com>
> >> ---
> >> I decided to go the configure route since other features (cmov, clz) also do
> >> it, but if preferred this could instead be done with a new intmath.h header
> >> in the x86/ folder containing something like
> >>
> >> #if defined(__GNUC__) && defined(__POPCNT__)
> >>     #define av_popcount   __builtin_popcount
> >> #if ARCH_X86_64
> >>     #define av_popcount64 __builtin_popcountll
> >> #endif
> >> #endif
> >>
> >> For a cleaner compile time check.
> >>
> >>  configure           | 12 ++++++++++--
> >>  libavutil/intmath.h | 13 +++++++++++++
> >>  2 files changed, 23 insertions(+), 2 deletions(-)
> >>
> > 
> > For the record, the builtin implementation looks like this here:
> > 
> > 0000000000000000 <av_popcount_c>:
> >    0:   89 f8                   mov    %edi,%eax
> >    2:   d1 e8                   shr    %eax
> >    4:   25 55 55 55 55          and    $0x55555555,%eax
> >    9:   29 c7                   sub    %eax,%edi
> >    b:   89 fa                   mov    %edi,%edx
> >    d:   c1 ef 02                shr    $0x2,%edi
> >   10:   81 e2 33 33 33 33       and    $0x33333333,%edx
> >   16:   81 e7 33 33 33 33       and    $0x33333333,%edi
> >   1c:   8d 04 17                lea    (%rdi,%rdx,1),%eax
> >   1f:   89 c2                   mov    %eax,%edx
> >   21:   c1 ea 04                shr    $0x4,%edx
> >   24:   01 d0                   add    %edx,%eax
> >   26:   25 0f 0f 0f 0f          and    $0xf0f0f0f,%eax
> >   2b:   89 c2                   mov    %eax,%edx
> >   2d:   c1 ea 08                shr    $0x8,%edx
> >   30:   01 d0                   add    %edx,%eax
> >   32:   89 c2                   mov    %eax,%edx
> >   34:   c1 ea 10                shr    $0x10,%edx
> >   37:   01 d0                   add    %edx,%eax
> >   39:   83 e0 3f                and    $0x3f,%eax
> >   3c:   c3                      retq   
> >   3d:   0f 1f 00                nopl   (%rax)
> > 
> > 0000000000000040 <popcount_gcc>:
> >   40:   48 83 ec 08             sub    $0x8,%rsp
> >   44:   89 ff                   mov    %edi,%edi
> >   46:   e8 00 00 00 00          callq  4b <popcount_gcc+0xb>
> >   4b:   48 83 c4 08             add    $0x8,%rsp
> >   4f:   c3                      retq   
> > 
> > 0000000000000040 <popcount_clang>:
> >   40:   89 f8                   mov    %edi,%eax
> >   42:   d1 e8                   shr    %eax
> >   44:   25 55 55 55 55          and    $0x55555555,%eax
> >   49:   29 c7                   sub    %eax,%edi
> >   4b:   89 f8                   mov    %edi,%eax
> >   4d:   25 33 33 33 33          and    $0x33333333,%eax
> >   52:   c1 ef 02                shr    $0x2,%edi
> >   55:   81 e7 33 33 33 33       and    $0x33333333,%edi
> >   5b:   01 c7                   add    %eax,%edi
> >   5d:   89 f8                   mov    %edi,%eax
> >   5f:   c1 e8 04                shr    $0x4,%eax
> >   62:   01 f8                   add    %edi,%eax
> >   64:   25 0f 0f 0f 0f          and    $0xf0f0f0f,%eax
> >   69:   69 c0 01 01 01 01       imul   $0x1010101,%eax,%eax
> >   6f:   c1 e8 18                shr    $0x18,%eax
> >   72:   c3                      retq   
> > 
> > We might find relevant "optimizations" in there to apply to our reference code.
> 
> What's clang's code for av_popcount64_c, or their builtin?

0000000000000000 <popcount64_clang>:
   0:   48 89 f8                mov    rax,rdi
   3:   48 d1 e8                shr    rax,1
   6:   48 b9 55 55 55 55 55    movabs rcx,0x5555555555555555
   d:   55 55 55 
  10:   48 21 c1                and    rcx,rax
  13:   48 29 cf                sub    rdi,rcx
  16:   48 b8 33 33 33 33 33    movabs rax,0x3333333333333333
  1d:   33 33 33 
  20:   48 89 f9                mov    rcx,rdi
  23:   48 21 c1                and    rcx,rax
  26:   48 c1 ef 02             shr    rdi,0x2
  2a:   48 21 c7                and    rdi,rax
  2d:   48 01 cf                add    rdi,rcx
  30:   48 89 f8                mov    rax,rdi
  33:   48 c1 e8 04             shr    rax,0x4
  37:   48 01 f8                add    rax,rdi
  3a:   48 b9 0f 0f 0f 0f 0f    movabs rcx,0xf0f0f0f0f0f0f0f
  41:   0f 0f 0f 
  44:   48 21 c1                and    rcx,rax
  47:   48 b8 01 01 01 01 01    movabs rax,0x101010101010101
  4e:   01 01 01 
  51:   48 0f af c1             imul   rax,rcx
  55:   48 c1 e8 38             shr    rax,0x38
  59:   c3                      ret    
  5a:   66 0f 1f 44 00 00       nop    WORD PTR [rax+rax*1+0x0]

0000000000000060 <av_popcount64_c>:
  60:   89 f8                   mov    eax,edi
  62:   d1 e8                   shr    eax,1
  64:   25 55 55 55 55          and    eax,0x55555555
  69:   89 f9                   mov    ecx,edi
  6b:   29 c1                   sub    ecx,eax
  6d:   89 c8                   mov    eax,ecx
  6f:   25 33 33 33 33          and    eax,0x33333333
  74:   c1 e9 02                shr    ecx,0x2
  77:   81 e1 33 33 33 33       and    ecx,0x33333333
  7d:   01 c1                   add    ecx,eax
  7f:   89 c8                   mov    eax,ecx
  81:   c1 e8 04                shr    eax,0x4
  84:   01 c8                   add    eax,ecx
  86:   25 0f 0f 0f 0f          and    eax,0xf0f0f0f
  8b:   89 c1                   mov    ecx,eax
  8d:   c1 e9 08                shr    ecx,0x8
  90:   01 c1                   add    ecx,eax
  92:   89 c8                   mov    eax,ecx
  94:   c1 e8 10                shr    eax,0x10
  97:   01 c8                   add    eax,ecx
  99:   83 e0 3f                and    eax,0x3f
  9c:   48 89 f9                mov    rcx,rdi
  9f:   48 c1 e9 20             shr    rcx,0x20
  a3:   48 c1 ef 21             shr    rdi,0x21
  a7:   81 e7 55 55 55 55       and    edi,0x55555555
  ad:   29 f9                   sub    ecx,edi
  af:   89 ca                   mov    edx,ecx
  b1:   81 e2 33 33 33 33       and    edx,0x33333333
  b7:   c1 e9 02                shr    ecx,0x2
  ba:   81 e1 33 33 33 33       and    ecx,0x33333333
  c0:   01 d1                   add    ecx,edx
  c2:   89 ca                   mov    edx,ecx
  c4:   c1 ea 04                shr    edx,0x4
  c7:   01 ca                   add    edx,ecx
  c9:   81 e2 0f 0f 0f 0f       and    edx,0xf0f0f0f
  cf:   89 d1                   mov    ecx,edx
  d1:   c1 e9 08                shr    ecx,0x8
  d4:   01 d1                   add    ecx,edx
  d6:   89 ca                   mov    edx,ecx
  d8:   c1 ea 10                shr    edx,0x10
  db:   01 ca                   add    edx,ecx
  dd:   83 e2 3f                and    edx,0x3f
  e0:   01 d0                   add    eax,edx
  e2:   c3                      ret    


> We're currently calling av_popcount_c twice from within av_popcount64_c, 
> when on x86_64 CPUs we could probably take advantage of the 64-bit GPRs.
> 

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150225/e14318ec/attachment.asc>