Update of /cvsroot/mplayer/main/libvo
In directory usw-pr-cvs1:/tmp/cvs-serv9489

Modified Files:
	fastmemcpy.h
Log Message:
- applied SSE patch by Nick Kurshev

Index: fastmemcpy.h
===================================================================
RCS file: /cvsroot/mplayer/main/libvo/fastmemcpy.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** fastmemcpy.h	2001/04/12 14:40:10	1.4
--- fastmemcpy.h	2001/04/14 17:56:44	1.5
***************
*** 28,85 ****
  	__asm__ __volatile__ (
!         "1: prefetchnta (%0)\n"		/* This set is 28 bytes */
!         "   prefetchnta 64(%0)\n"
!         "   prefetchnta 128(%0)\n"
!         "   prefetchnta 192(%0)\n"
!         "   prefetchnta 256(%0)\n"
! #if 0
!         "2:  \n"
!         ".section .fixup, \"ax\"\n"
!         "3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
!         "   jmp 2b\n"
!         ".previous\n"
!         ".section __ex_table,\"a\"\n"
!         "	.align 4\n"
!         "	.long 1b, 3b\n"
!         ".previous"
! #endif
  		: : "r" (from) );
!
!
    for(; i>0; i--)
  	{
  		__asm__ __volatile__ (
!         "1: prefetchnta 320(%0)\n"
!         "2: movq (%0), %%mm0\n"
!         "   movq 8(%0), %%mm1\n"
!         "   movq 16(%0), %%mm2\n"
!         "   movq 24(%0), %%mm3\n"
!         "   movntq %%mm0, (%1)\n"
!         "   movntq %%mm1, 8(%1)\n"
!         "   movntq %%mm2, 16(%1)\n"
!         "   movntq %%mm3, 24(%1)\n"
!         "   movq 32(%0), %%mm0\n"
!         "   movq 40(%0), %%mm1\n"
!         "   movq 48(%0), %%mm2\n"
!         "   movq 56(%0), %%mm3\n"
!         "   movntq %%mm0, 32(%1)\n"
!         "   movntq %%mm1, 40(%1)\n"
!         "   movntq %%mm2, 48(%1)\n"
!         "   movntq %%mm3, 56(%1)\n"
! #if 0
!         ".section .fixup, \"ax\"\n"
!         "3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
!         "   jmp 2b\n"
!         ".previous\n"
!         ".section __ex_table,\"a\"\n"
!         "	.align 4\n"
!         "	.long 1b, 3b\n"
!         ".previous"
! #endif
!         : : "r" (from), "r" (to) : "memory");
  		from+=64;
  		to+=64;
  	}
!   	__asm__ __volatile__ ("emms":::"memory");
  }

  /*
--- 28,82 ----
  	__asm__ __volatile__ (
!         "prefetchnta (%0)\n"
!         "prefetchnta 64(%0)\n"
!         "prefetchnta 128(%0)\n"
!         "prefetchnta 192(%0)\n"
!         "prefetchnta 256(%0)\n"
  		: : "r" (from) );
!   /*
!      This algorithm is top effective when the code consequently
!      reads and writes blocks which have size of cache line.
!      Size of cache line is processor-dependent.
!      It will, however, be a minimum of 32 bytes on any processors.
!      It would be better to have a number of instructions which
!      perform reading and writing to be multiple to a number of
!      processor's decoders, but it's not always possible.
!   */
    for(; i>0; i--)
  	{
  		__asm__ __volatile__ (
!         "prefetchnta 320(%0)\n"
! #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
!         "movups (%0), %%xmm0\n"
!         "movups 16(%0), %%xmm1\n"
!         "movntps %%xmm0, (%1)\n"
!         "movntps %%xmm1, 16(%1)\n"
!         "movups 32(%0), %%xmm0\n"
!         "movups 48(%0), %%xmm1\n"
!         "movntps %%xmm0, 32(%1)\n"
!         "movntps %%xmm1, 48(%1)\n"
! #else /* Only K7 (may be other) */
!         "movq (%0), %%mm0\n"
!         "movq 8(%0), %%mm1\n"
!         "movq 16(%0), %%mm2\n"
!         "movq 24(%0), %%mm3\n"
!         "movntq %%mm0, (%1)\n"
!         "movntq %%mm1, 8(%1)\n"
!         "movntq %%mm2, 16(%1)\n"
!         "movntq %%mm3, 24(%1)\n"
!         "movq 32(%0), %%mm0\n"
!         "movq 40(%0), %%mm1\n"
!         "movq 48(%0), %%mm2\n"
!         "movq 56(%0), %%mm3\n"
!         "movntq %%mm0, 32(%1)\n"
!         "movntq %%mm1, 40(%1)\n"
!         "movntq %%mm2, 48(%1)\n"
!         "movntq %%mm3, 56(%1)\n"
! #endif
!         :: "r" (from), "r" (to) : "memory");
  		from+=64;
  		to+=64;
  	}
!         __asm__ __volatile__ ("emms":::"memory");
  }

  /*

_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog@lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog
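[Editorial note, not part of the original post: for readers less used to inline assembly, the following is a rough C sketch of the same idea using the SSE intrinsics from <xmmintrin.h> -- prefetch well ahead of the copy, unaligned 16-byte loads, and non-temporal (cache-bypassing) stores, 64 bytes per iteration. The helper name stream_copy_sse and the memcpy tail handling are illustrative only; like the movntps path in the patch, it assumes the destination is 16-byte aligned and that SSE is available.]

#include <stddef.h>
#include <string.h>
#include <xmmintrin.h>   /* SSE intrinsics: _mm_prefetch, _mm_loadu_ps, _mm_stream_ps */

/*
 * Illustrative sketch only (not the patched code): copies len bytes,
 * streaming 64 bytes per iteration past the cache.  The destination
 * must be 16-byte aligned (movntps / _mm_stream_ps requirement);
 * the source may be unaligned (movups / _mm_loadu_ps).
 */
static void stream_copy_sse(void *to, const void *from, size_t len)
{
	const char *src = (const char *)from;
	char *dst = (char *)to;
	size_t i = len >> 6;               /* number of 64-byte blocks */

	while (i--) {
		/* prefetch well ahead of the current block */
		_mm_prefetch(src + 320, _MM_HINT_NTA);

		__m128 a = _mm_loadu_ps((const float *)(src));
		__m128 b = _mm_loadu_ps((const float *)(src + 16));
		__m128 c = _mm_loadu_ps((const float *)(src + 32));
		__m128 d = _mm_loadu_ps((const float *)(src + 48));

		/* non-temporal stores: write around the cache, like movntps */
		_mm_stream_ps((float *)(dst),      a);
		_mm_stream_ps((float *)(dst + 16), b);
		_mm_stream_ps((float *)(dst + 32), c);
		_mm_stream_ps((float *)(dst + 48), d);

		src += 64;
		dst += 64;
	}
	_mm_sfence();                      /* order the streaming stores */
	memcpy(dst, src, len & 63);        /* plain copy for the remaining tail */
}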