[FFmpeg-devel] [PATCH 1/3] common: add ff_parity()

Clément Bœsch u at pkh.me
Sun Jan 3 22:31:22 CET 2016

On Sun, Jan 03, 2016 at 05:56:41PM -0300, James Almer wrote:
> >> +static av_always_inline av_const int ff_parity(uint32_t v)
> >> +{
> >> +#if HAVE_PARITY
> >> +    return __builtin_parity(v);
> >> +#else
> >> +    return av_popcount(v) & 1;
> >> +#endif
> > 
> > Do compilers really generate better code for the former?
> GCC does on x86 when the target cpu doesn't support the popcnt instruction,
> otherwise the end result would be the same (popcnt + and).
> av_popcount_c() is not optimal for this.

For the record, this is what it looks like here (GCC 5.3.0, clang 3.7.0, x86-64):

[/tmp]☭ cat a.c
#include <stdint.h>

int parity0(uint32_t x) { return __builtin_popcount(x) & 1; }
int parity1(uint32_t x) { return __builtin_parity(x); }
[/tmp]☭ gcc -O2 -c a.c && objdump -r -d -Mintel a.o

a.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <parity0>:
   0:	48 83 ec 08          	sub    rsp,0x8
   4:	89 ff                	mov    edi,edi
   6:	e8 00 00 00 00       	call   b <parity0+0xb>
			7: R_X86_64_PC32	__popcountdi2-0x4
   b:	48 83 c4 08          	add    rsp,0x8
   f:	83 e0 01             	and    eax,0x1
  12:	c3                   	ret    
  13:	0f 1f 00             	nop    DWORD PTR [rax]
  16:	66 2e 0f 1f 84 00 00 	nop    WORD PTR cs:[rax+rax*1+0x0]
  1d:	00 00 00 

0000000000000020 <parity1>:
  20:	89 f8                	mov    eax,edi
  22:	c1 ef 10             	shr    edi,0x10
  25:	31 f8                	xor    eax,edi
  27:	30 e0                	xor    al,ah
  29:	0f 9b c0             	setnp  al
  2c:	0f b6 c0             	movzx  eax,al
  2f:	c3                   	ret    
[/tmp]☭ clang -O2 -c a.c && objdump -r -d -Mintel a.o

a.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <parity0>:
   0:	89 f8                	mov    eax,edi
   2:	d1 e8                	shr    eax,1
   4:	25 55 55 55 55       	and    eax,0x55555555
   9:	29 c7                	sub    edi,eax
   b:	89 f8                	mov    eax,edi
   d:	25 33 33 33 33       	and    eax,0x33333333
  12:	c1 ef 02             	shr    edi,0x2
  15:	81 e7 33 33 33 33    	and    edi,0x33333333
  1b:	01 c7                	add    edi,eax
  1d:	89 f8                	mov    eax,edi
  1f:	c1 e8 04             	shr    eax,0x4
  22:	01 f8                	add    eax,edi
  24:	25 0f 0f 0f 01       	and    eax,0x10f0f0f
  29:	69 c0 01 01 01 01    	imul   eax,eax,0x1010101
  2f:	c1 e8 18             	shr    eax,0x18
  32:	83 e0 01             	and    eax,0x1
  35:	c3                   	ret    
  36:	66 2e 0f 1f 84 00 00 	nop    WORD PTR cs:[rax+rax*1+0x0]
  3d:	00 00 00 

0000000000000040 <parity1>:
  40:	89 f8                	mov    eax,edi
  42:	d1 e8                	shr    eax,1
  44:	25 55 55 55 55       	and    eax,0x55555555
  49:	29 c7                	sub    edi,eax
  4b:	89 f8                	mov    eax,edi
  4d:	25 33 33 33 33       	and    eax,0x33333333
  52:	c1 ef 02             	shr    edi,0x2
  55:	81 e7 33 33 33 33    	and    edi,0x33333333
  5b:	01 c7                	add    edi,eax
  5d:	89 f8                	mov    eax,edi
  5f:	c1 e8 04             	shr    eax,0x4
  62:	01 f8                	add    eax,edi
  64:	25 0f 0f 0f 01       	and    eax,0x10f0f0f
  69:	69 c0 01 01 01 01    	imul   eax,eax,0x1010101
  6f:	c1 e8 18             	shr    eax,0x18
  72:	83 e0 01             	and    eax,0x1
  75:	c3                   	ret    

Conclusion: with GCC it matters, not so much with Clang.

Clément B.
