[FFmpeg-devel] [PATCH] VC-1 MMX DSP functions
Zuxy Meng
zuxy.meng
Sun Jul 8 17:05:07 CEST 2007
Hi,
2007/7/8, Christophe GISQUET <christophe.gisquet at free.fr>:
> Hello,
>
> Zuxy Meng a écrit :
> > I did a quick test on 64-bit K8 tonight thanks to Stephan's testbed.
>
> And myself on a x86-64 core2 system.
>
> > The result wasn't promising. In short, from fastest to slowest:
> > MMX > SSE2 w/o sw pipelining > SSE2 w/ sw pipelining
>
> I haven't tested the mid-performer, but I can confirm this. Using
> START/STOP_TIMER, the figures are (on a 1080p sequence): ~2800
> dezicycles for MMX, ~3800 for SSE2.
I suspect something is wrong. IIRC 32-bit SSE2 (w/ sw pipelining)
is faster than MMX on your Conroe. How can it be more than 25% slower
under 64-bit?
>
> > So the conclusion is that I can't draw a conclusion. Any suggestions?
>
> Maybe have a look at the attached opannotate (based on 4 runs) for your
> s/w pipelined SSE2 functions?
>
> The 1/4 and 3/4 seem well pipelined, with only the output that's costly.
> However, if opannotate is to be believed (because some timings are very
> surprising), the 1/2 gets quite a lot of stalls, probably up to the
> point where they make up for most of the execution time.
>
> Best regards,
> --
> Christophe GISQUET
>
> /*
> * Command line: opannotate -a -i vc1_put_shift1_sse2,vc1_put_shift2_sse2,vc1_put_shift3_sse2
> *
> * Interpretation of command line:
> * Output annotated assembly listing with samples
> *
> * CPU: Core 2, speed 2167 MHz (estimated)
> * Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
> */
> :
> :/home/chris/src/ffmpeg/ffmpeg_g: file format elf64-x86-64
> :
> :Disassembly of section .init:
> :Disassembly of section .plt:
> :Disassembly of section .text:
> :
> 000000000050c970 <vc1_put_shift2_sse2>: /* vc1_put_shift2_sse2 total: 30389 59.9874 */
> 311 0.6139 : 50c970: mov %r9d,0xfffffffffffffffc(%rsp)
> 96 0.1895 : 50c975: mov $0x8,%eax
> : 50c97a: movslq 0x8(%rsp),%r9
> 144 0.2843 : 50c97f: sub 0xfffffffffffffffc(%rsp),%eax
> 195 0.3849 : 50c983: movslq %ecx,%rcx
> : 50c986: movslq %esi,%rsi
> 83 0.1638 : 50c989: sub %r9,%rdx
> 6 0.0118 : 50c98c: mov %eax,0xfffffffffffffffc(%rsp)
> 85 0.1678 : 50c990: lea (%r9,%r9,2),%rax
> 1 0.0020 : 50c994: movd 0xfffffffffffffffc(%rsp),%xmm7
> 386 0.7620 : 50c99a: punpcklwd %xmm7,%xmm7
> 506 0.9988 : 50c99e: pshufd $0x0,%xmm7,%xmm7
> 580 1.1449 : 50c9a3: movq (%rdx),%xmm11
> 306 0.6040 : 50c9a8: movq (%rdx,%r9,1),%xmm12
> 202 0.3987 : 50c9ae: movq (%rdx,%r9,2),%xmm5
> 201 0.3968 : 50c9b4: movq (%rdx,%rax,1),%xmm6
> 356 0.7027 : 50c9b9: punpcklbw %xmm0,%xmm11
> 37 0.0730 : 50c9be: punpcklbw %xmm0,%xmm12
> 131 0.2586 : 50c9c3: punpcklbw %xmm0,%xmm5
> 151 0.2981 : 50c9c7: punpcklbw %xmm0,%xmm6
> 156 0.3079 : 50c9cb: nopl 0x0(%rax,%rax,1)
> 1170 2.3096 : 50c9d0: add %rcx,%rdx
> 128 0.2527 : 50c9d3: movdqa %xmm11,%xmm1
> 24 0.0474 : 50c9d8: movdqa %xmm12,%xmm2
> 152 0.3000 : 50c9dd: movdqa %xmm5,%xmm3
> 1172 2.3135 : 50c9e1: movdqa %xmm6,%xmm4
> 18 0.0355 : 50c9e5: movq (%rdx),%xmm11
> 4028 7.9512 : 50c9ea: movq (%rdx,%r9,1),%xmm12
> 2040 4.0269 : 50c9f0: movq (%rdx,%r9,2),%xmm5
> 3116 6.1509 : 50c9f6: movq (%rdx,%rax,1),%xmm6
> 2812 5.5508 : 50c9fb: punpcklbw %xmm0,%xmm11
> 1291 2.5484 : 50ca00: punpcklbw %xmm0,%xmm12
> 308 0.6080 : 50ca05: punpcklbw %xmm0,%xmm5
> 1250 2.4675 : 50ca09: punpcklbw %xmm0,%xmm6
> 1436 2.8346 : 50ca0d: paddsw %xmm2,%xmm3
> 14 0.0276 : 50ca11: paddsw %xmm1,%xmm4
> 92 0.1816 : 50ca15: movdqa %xmm3,%xmm2
> 993 1.9602 : 50ca19: psllw $0x3,%xmm3
> 215 0.4244 : 50ca1e: paddw %xmm2,%xmm3
> 71 0.1402 : 50ca22: psubsw %xmm4,%xmm3
> 170 0.3356 : 50ca26: paddsw %xmm7,%xmm3
> 1113 2.1970 : 50ca2a: psraw $0x4,%xmm3
> 344 0.6791 : 50ca2f: packuswb %xmm3,%xmm3
> 3386 6.6839 : 50ca33: movq %xmm3,(%rdi)
> 1034 2.0411 : 50ca37: add %rsi,%rdi
> 8 0.0158 : 50ca3a: dec %r8d
> 50 0.0987 : 50ca3d: jne 50c9d0 <vc1_put_shift2_sse2+0x60>
> 21 0.0415 : 50ca3f: retq
> :Disassembly of section .fini:
> :
> :/home/chris/src/ffmpeg/ffmpeg_g: file format elf64-x86-64
> :
> :Disassembly of section .init:
> :Disassembly of section .plt:
> :Disassembly of section .text:
> :
> 000000000050c7c0 <vc1_put_shift1_sse2>: /* vc1_put_shift1_sse2 total: 10557 20.8393 */
> 169 0.3336 : 50c7c0: mov %r9d,0xfffffffffffffffc(%rsp)
> 61 0.1204 : 50c7c5: mov $0x20,%eax
> : 50c7ca: movslq 0x8(%rsp),%r9
> 62 0.1224 : 50c7cf: sub 0xfffffffffffffffc(%rsp),%eax
> 74 0.1461 : 50c7d3: movslq %ecx,%rcx
> : 50c7d6: movslq %esi,%rsi
> 26 0.0513 : 50c7d9: sub %r9,%rdx
> 2 0.0039 : 50c7dc: mov %eax,0xfffffffffffffffc(%rsp)
> 41 0.0809 : 50c7e0: lea (%r9,%r9,2),%rax
> : 50c7e4: movd 0xfffffffffffffffc(%rsp),%xmm7
> 191 0.3770 : 50c7ea: punpcklwd %xmm7,%xmm7
> 242 0.4777 : 50c7ee: pshufd $0x0,%xmm7,%xmm7
> 263 0.5192 : 50c7f3: movq (%rdx),%xmm11
> 205 0.4047 : 50c7f8: movq (%rdx,%r9,1),%xmm12
> 99 0.1954 : 50c7fe: movq (%rdx,%r9,2),%xmm5
> 129 0.2546 : 50c804: movq (%rdx,%rax,1),%xmm6
> 167 0.3297 : 50c809: punpcklbw %xmm0,%xmm11
> 14 0.0276 : 50c80e: punpcklbw %xmm0,%xmm12
> 78 0.1540 : 50c813: punpcklbw %xmm0,%xmm5
> 80 0.1579 : 50c817: punpcklbw %xmm0,%xmm6
> 69 0.1362 : 50c81b: nopl 0x0(%rax,%rax,1)
> 553 1.0916 : 50c820: add %rcx,%rdx
> 112 0.2211 : 50c823: movdqa %xmm11,%xmm1
> 18 0.0355 : 50c828: movdqa %xmm12,%xmm2
> 72 0.1421 : 50c82d: movdqa %xmm5,%xmm3
> 557 1.0995 : 50c831: movdqa %xmm6,%xmm4
> 13 0.0257 : 50c835: punpcklbw %xmm0,%xmm11
> 669 1.3206 : 50c83a: punpcklbw %xmm0,%xmm12
> 100 0.1974 : 50c83f: punpcklbw %xmm0,%xmm5
> 659 1.3009 : 50c843: punpcklbw %xmm0,%xmm6
> 700 1.3818 : 50c847: pmullw %xmm8,%xmm2
> 24 0.0474 : 50c84c: psllw $0x2,%xmm1
> 38 0.0750 : 50c851: pmullw %xmm9,%xmm3
> 586 1.1568 : 50c856: psubsw %xmm4,%xmm2
> 34 0.0671 : 50c85a: paddsw %xmm4,%xmm1
> 29 0.0572 : 50c85e: psubsw %xmm4,%xmm3
> 59 0.1165 : 50c862: psubsw %xmm1,%xmm2
> 606 1.1962 : 50c866: paddsw %xmm2,%xmm3
> 183 0.3612 : 50c86a: paddsw %xmm7,%xmm3
> 325 0.6415 : 50c86e: psraw $0x6,%xmm3
> 371 0.7323 : 50c873: packuswb %xmm3,%xmm3
> 2246 4.4336 : 50c877: movq %xmm3,(%rdi)
> 569 1.1232 : 50c87b: add %rsi,%rdi
> 62 0.1224 : 50c87e: dec %r8d
> : 50c881: jne 50c820 <vc1_put_shift1_sse2+0x60>
> : 50c883: retq
> : 50c884: nopw 0x0(%rax,%rax,1)
> : 50c88a: nopw 0x0(%rax,%rax,1)
> :Disassembly of section .fini:
> :
> :/home/chris/src/ffmpeg/ffmpeg_g: file format elf64-x86-64
> :
> :Disassembly of section .init:
> :Disassembly of section .plt:
> :Disassembly of section .text:
> :
> 000000000050c8a0 <vc1_put_shift3_sse2>: /* vc1_put_shift3_sse2 total: 9713 19.1733 */
> 147 0.2902 : 50c8a0: mov %r9d,0xfffffffffffffffc(%rsp)
> 57 0.1125 : 50c8a5: mov $0x20,%eax
> : 50c8aa: movslq 0x8(%rsp),%r9
> 89 0.1757 : 50c8af: sub 0xfffffffffffffffc(%rsp),%eax
> 178 0.3514 : 50c8b3: movslq %ecx,%rcx
> : 50c8b6: movslq %esi,%rsi
> 46 0.0908 : 50c8b9: sub %r9,%rdx
> 7 0.0138 : 50c8bc: mov %eax,0xfffffffffffffffc(%rsp)
> 43 0.0849 : 50c8c0: lea (%r9,%r9,2),%rax
> 9 0.0178 : 50c8c4: movd 0xfffffffffffffffc(%rsp),%xmm7
> 159 0.3139 : 50c8ca: punpcklwd %xmm7,%xmm7
> 200 0.3948 : 50c8ce: pshufd $0x0,%xmm7,%xmm7
> 242 0.4777 : 50c8d3: movq (%rdx,%rax,1),%xmm11
> 229 0.4520 : 50c8d9: movq (%rdx,%r9,2),%xmm12
> 90 0.1777 : 50c8df: movq (%rdx,%r9,1),%xmm5
> 92 0.1816 : 50c8e5: movq (%rdx),%xmm6
> 139 0.2744 : 50c8e9: punpcklbw %xmm0,%xmm11
> 6 0.0118 : 50c8ee: punpcklbw %xmm0,%xmm12
> 63 0.1244 : 50c8f3: punpcklbw %xmm0,%xmm5
> 62 0.1224 : 50c8f7: punpcklbw %xmm0,%xmm6
> 68 0.1342 : 50c8fb: nopl 0x0(%rax,%rax,1)
> 541 1.0679 : 50c900: add %rcx,%rdx
> 71 0.1402 : 50c903: movdqa %xmm11,%xmm1
> 13 0.0257 : 50c908: movdqa %xmm12,%xmm2
> 58 0.1145 : 50c90d: movdqa %xmm5,%xmm3
> 556 1.0975 : 50c911: movdqa %xmm6,%xmm4
> 15 0.0296 : 50c915: punpcklbw %xmm0,%xmm11
> 599 1.1824 : 50c91a: punpcklbw %xmm0,%xmm12
> 90 0.1777 : 50c91f: punpcklbw %xmm0,%xmm5
> 585 1.1548 : 50c923: punpcklbw %xmm0,%xmm6
> 592 1.1686 : 50c927: pmullw %xmm8,%xmm2
> 15 0.0296 : 50c92c: psllw $0x2,%xmm1
> 43 0.0849 : 50c931: pmullw %xmm9,%xmm3
> 524 1.0344 : 50c936: psubsw %xmm4,%xmm2
> 36 0.0711 : 50c93a: paddsw %xmm4,%xmm1
> 19 0.0375 : 50c93e: psubsw %xmm4,%xmm3
> 40 0.0790 : 50c942: psubsw %xmm1,%xmm2
> 542 1.0699 : 50c946: paddsw %xmm2,%xmm3
> 124 0.2448 : 50c94a: paddsw %xmm7,%xmm3
> 244 0.4817 : 50c94e: psraw $0x6,%xmm3
> 296 0.5843 : 50c953: packuswb %xmm3,%xmm3
> 2162 4.2678 : 50c957: movq %xmm3,(%rdi)
> 580 1.1449 : 50c95b: add %rsi,%rdi
> 42 0.0829 : 50c95e: dec %r8d
> : 50c961: jne 50c900 <vc1_put_shift3_sse2+0x60>
> : 50c963: retq
> : 50c964: nopw 0x0(%rax,%rax,1)
> : 50c96a: nopw 0x0(%rax,%rax,1)
> :Disassembly of section .fini:
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at mplayerhq.hu
> http://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-devel
>
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
More information about the ffmpeg-devel
mailing list