[FFmpeg-devel] [PATCH] libavutil: add x86 optimized av_popcount
Clément Bœsch
u at pkh.me
Wed Feb 25 22:35:26 CET 2015
On Wed, Feb 25, 2015 at 02:29:11PM -0300, James Almer wrote:
> On 25/02/15 12:43 PM, Clément Bœsch wrote:
> > On Tue, Feb 24, 2015 at 10:05:24PM -0300, James Almer wrote:
> >> Signed-off-by: James Almer <jamrial at gmail.com>
> >> ---
> >> I decided to go the configure route since other features (cmov, clz) also do
> >> it, but if preferred this could instead be done with a new intmath.h header
> >> in the x86/ folder containing something like
> >>
> >> #if defined(__GNUC__) && defined(__POPCNT__)
> >> #define av_popcount __builtin_popcount
> >> #if ARCH_X86_64
> >> #define av_popcount64 __builtin_popcountll
> >> #endif
> >> #endif
> >>
> >> For a cleaner compile time check.
> >>
> >> configure | 12 ++++++++++--
> >> libavutil/intmath.h | 13 +++++++++++++
> >> 2 files changed, 23 insertions(+), 2 deletions(-)
> >>
> >
> > For the record, the builtin implementation looks like this here:
> >
> > 0000000000000000 <av_popcount_c>:
> > 0: 89 f8 mov %edi,%eax
> > 2: d1 e8 shr %eax
> > 4: 25 55 55 55 55 and $0x55555555,%eax
> > 9: 29 c7 sub %eax,%edi
> > b: 89 fa mov %edi,%edx
> > d: c1 ef 02 shr $0x2,%edi
> > 10: 81 e2 33 33 33 33 and $0x33333333,%edx
> > 16: 81 e7 33 33 33 33 and $0x33333333,%edi
> > 1c: 8d 04 17 lea (%rdi,%rdx,1),%eax
> > 1f: 89 c2 mov %eax,%edx
> > 21: c1 ea 04 shr $0x4,%edx
> > 24: 01 d0 add %edx,%eax
> > 26: 25 0f 0f 0f 0f and $0xf0f0f0f,%eax
> > 2b: 89 c2 mov %eax,%edx
> > 2d: c1 ea 08 shr $0x8,%edx
> > 30: 01 d0 add %edx,%eax
> > 32: 89 c2 mov %eax,%edx
> > 34: c1 ea 10 shr $0x10,%edx
> > 37: 01 d0 add %edx,%eax
> > 39: 83 e0 3f and $0x3f,%eax
> > 3c: c3 retq
> > 3d: 0f 1f 00 nopl (%rax)
> >
> > 0000000000000040 <popcount_gcc>:
> > 40: 48 83 ec 08 sub $0x8,%rsp
> > 44: 89 ff mov %edi,%edi
> > 46: e8 00 00 00 00 callq 4b <popcount_gcc+0xb>
> > 4b: 48 83 c4 08 add $0x8,%rsp
> > 4f: c3 retq
> >
> > 0000000000000040 <popcount_clang>:
> > 40: 89 f8 mov %edi,%eax
> > 42: d1 e8 shr %eax
> > 44: 25 55 55 55 55 and $0x55555555,%eax
> > 49: 29 c7 sub %eax,%edi
> > 4b: 89 f8 mov %edi,%eax
> > 4d: 25 33 33 33 33 and $0x33333333,%eax
> > 52: c1 ef 02 shr $0x2,%edi
> > 55: 81 e7 33 33 33 33 and $0x33333333,%edi
> > 5b: 01 c7 add %eax,%edi
> > 5d: 89 f8 mov %edi,%eax
> > 5f: c1 e8 04 shr $0x4,%eax
> > 62: 01 f8 add %edi,%eax
> > 64: 25 0f 0f 0f 0f and $0xf0f0f0f,%eax
> > 69: 69 c0 01 01 01 01 imul $0x1010101,%eax,%eax
> > 6f: c1 e8 18 shr $0x18,%eax
> > 72: c3 retq
> >
> > We might find relevant "optimizations" for our reference code in these listings.
>
> What's clang code for av_popcount64_c, or their builtin?
0000000000000000 <popcount64_clang>:
0: 48 89 f8 mov rax,rdi
3: 48 d1 e8 shr rax,1
6: 48 b9 55 55 55 55 55 movabs rcx,0x5555555555555555
d: 55 55 55
10: 48 21 c1 and rcx,rax
13: 48 29 cf sub rdi,rcx
16: 48 b8 33 33 33 33 33 movabs rax,0x3333333333333333
1d: 33 33 33
20: 48 89 f9 mov rcx,rdi
23: 48 21 c1 and rcx,rax
26: 48 c1 ef 02 shr rdi,0x2
2a: 48 21 c7 and rdi,rax
2d: 48 01 cf add rdi,rcx
30: 48 89 f8 mov rax,rdi
33: 48 c1 e8 04 shr rax,0x4
37: 48 01 f8 add rax,rdi
3a: 48 b9 0f 0f 0f 0f 0f movabs rcx,0xf0f0f0f0f0f0f0f
41: 0f 0f 0f
44: 48 21 c1 and rcx,rax
47: 48 b8 01 01 01 01 01 movabs rax,0x101010101010101
4e: 01 01 01
51: 48 0f af c1 imul rax,rcx
55: 48 c1 e8 38 shr rax,0x38
59: c3 ret
5a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
0000000000000060 <av_popcount64_c>:
60: 89 f8 mov eax,edi
62: d1 e8 shr eax,1
64: 25 55 55 55 55 and eax,0x55555555
69: 89 f9 mov ecx,edi
6b: 29 c1 sub ecx,eax
6d: 89 c8 mov eax,ecx
6f: 25 33 33 33 33 and eax,0x33333333
74: c1 e9 02 shr ecx,0x2
77: 81 e1 33 33 33 33 and ecx,0x33333333
7d: 01 c1 add ecx,eax
7f: 89 c8 mov eax,ecx
81: c1 e8 04 shr eax,0x4
84: 01 c8 add eax,ecx
86: 25 0f 0f 0f 0f and eax,0xf0f0f0f
8b: 89 c1 mov ecx,eax
8d: c1 e9 08 shr ecx,0x8
90: 01 c1 add ecx,eax
92: 89 c8 mov eax,ecx
94: c1 e8 10 shr eax,0x10
97: 01 c8 add eax,ecx
99: 83 e0 3f and eax,0x3f
9c: 48 89 f9 mov rcx,rdi
9f: 48 c1 e9 20 shr rcx,0x20
a3: 48 c1 ef 21 shr rdi,0x21
a7: 81 e7 55 55 55 55 and edi,0x55555555
ad: 29 f9 sub ecx,edi
af: 89 ca mov edx,ecx
b1: 81 e2 33 33 33 33 and edx,0x33333333
b7: c1 e9 02 shr ecx,0x2
ba: 81 e1 33 33 33 33 and ecx,0x33333333
c0: 01 d1 add ecx,edx
c2: 89 ca mov edx,ecx
c4: c1 ea 04 shr edx,0x4
c7: 01 ca add edx,ecx
c9: 81 e2 0f 0f 0f 0f and edx,0xf0f0f0f
cf: 89 d1 mov ecx,edx
d1: c1 e9 08 shr ecx,0x8
d4: 01 d1 add ecx,edx
d6: 89 ca mov edx,ecx
d8: c1 ea 10 shr edx,0x10
db: 01 ca add edx,ecx
dd: 83 e2 3f and edx,0x3f
e0: 01 d0 add eax,edx
e2: c3 ret
> We're currently calling av_popcount_c twice from within av_popcount64_c,
> when on x86_64 CPUs we could probably take advantage of the 64-bit GPRs.
>
--
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150225/e14318ec/attachment.asc>
More information about the ffmpeg-devel
mailing list