[Mplayer-cvslog] CVS: main/libvo fastmemcpy.h,1.9,1.10
Felix Buenemann
atmosfear at users.sourceforge.net
Sun Apr 22 21:25:49 CEST 2001
Update of /cvsroot/mplayer/main/libvo
In directory usw-pr-cvs1:/tmp/cvs-serv28452
Modified Files:
fastmemcpy.h
Log Message:
New optimized SSE code, overall optimizations.
Index: fastmemcpy.h
===================================================================
RCS file: /cvsroot/mplayer/main/libvo/fastmemcpy.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -r1.9 -r1.10
*** fastmemcpy.h 2001/04/21 21:49:28 1.9
--- fastmemcpy.h 2001/04/22 19:25:47 1.10
***************
*** 1,3 ****
-
#ifndef __MPLAYER_MEMCPY
#define __MPLAYER_MEMCPY
--- 1,2 ----
***************
*** 5,11 ****
/*
This part of code was taken by from Linux-2.4.3 and slightly modified
! for MMX2 instruction set. I have done it since linux uses page aligned
blocks but mplayer uses weakly ordered data and original sources can not
! speedup their. Only using prefetchnta and movntq together have effect!
If you have questions please contact with me: Nick Kurshev: nickols_k at mail.ru.
*/
--- 4,45 ----
/*
This part of code was taken by from Linux-2.4.3 and slightly modified
! for MMX2, SSE instruction set. I have done it since linux uses page aligned
blocks but mplayer uses weakly ordered data and original sources can not
! speed them up. Only using PREFETCHNTA and MOVNTQ together has an effect!
!
! From IA-32 Intel Architecture Software Developer's Manual Volume 1,
! Order Number 245470:
! "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
!
! Data referenced by a program can be temporal (data will be used again) or
! non-temporal (data will be referenced once and not reused in the immediate
! future). To make efficient use of the processor's caches, it is generally
! desirable to cache temporal data and not cache non-temporal data. Overloading
! the processor's caches with non-temporal data is sometimes referred to as
! "polluting the caches".
! The non-temporal data is written to memory with Write-Combining semantics.
!
! The PREFETCHh instructions permit a program to load data into the processor
! at a suggested cache level, so that it is closer to the processor's load and
! store unit when it is needed. If the data is already present in a level of
! the cache hierarchy that is closer to the processor, the PREFETCHh instruction
! will not result in any data movement.
! But we should use PREFETCHNTA: it fetches non-temporal data into a location
! close to the processor, minimizing cache pollution.
!
! The MOVNTQ (store quadword using non-temporal hint) instruction stores
! packed integer data from an MMX register to memory, using a non-temporal hint.
! The MOVNTPS (store packed single-precision floating-point values using
! non-temporal hint) instruction stores packed floating-point data from an
! XMM register to memory, using a non-temporal hint.
!
! The SFENCE (Store Fence) instruction controls write ordering by creating a
! fence for memory store operations. This instruction guarantees that the results
! of every store instruction that precedes the store fence in program order is
! globally visible before any store instruction that follows the fence. The
! SFENCE instruction provides an efficient way of ensuring ordering between
! procedures that produce weakly-ordered data and procedures that consume that
! data.
!
If you have questions please contact with me: Nick Kurshev: nickols_k at mail.ru.
*/
***************
*** 16,19 ****
--- 50,58 ----
#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW )
+ #undef HAVE_K6_2PLUS
+ #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
+ #define HAVE_K6_2PLUS
+ #endif
+
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
***************
*** 30,39 ****
int i;
-
#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
// printf("fastmemcpy_pre(0x%X,0x%X,0x%X)\n",to,from,len);
// Align dest to 16-byte boundary:
! if((unsigned int)to&15){
! int len2=16-((unsigned int)to&15);
if(len>len2){
len-=len2;
--- 69,77 ----
int i;
#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
// printf("fastmemcpy_pre(0x%X,0x%X,0x%X)\n",to,from,len);
// Align dest to 16-byte boundary:
! if((unsigned long)to&15){
! int len2=16-((unsigned long)to&15);
if(len>len2){
len-=len2;
***************
*** 47,51 ****
// printf("fastmemcpy(0x%X,0x%X,0x%X)\n",to,from,len);
#endif
-
if(len >= 0x200) /* 512-byte blocks */
--- 85,88 ----
***************
*** 56,60 ****
__asm__ __volatile__ (
! #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
"prefetch (%0)\n"
"prefetch 64(%0)\n"
--- 93,97 ----
__asm__ __volatile__ (
! #ifdef HAVE_K6_2PLUS
"prefetch (%0)\n"
"prefetch 64(%0)\n"
***************
*** 62,66 ****
"prefetch 192(%0)\n"
"prefetch 256(%0)\n"
! #else
"prefetchnta (%0)\n"
"prefetchnta 64(%0)\n"
--- 99,103 ----
"prefetch 192(%0)\n"
"prefetch 256(%0)\n"
! #else /* K7, P3, CyrixIII */
"prefetchnta (%0)\n"
"prefetchnta 64(%0)\n"
***************
*** 79,91 ****
processor's decoders, but it's not always possible.
*/
for(; i>0; i--)
{
__asm__ __volatile__ (
- #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
- "prefetch 320(%0)\n"
- #else
"prefetchnta 320(%0)\n"
- #endif
- #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
"movups (%0), %%xmm0\n"
"movups 16(%0), %%xmm1\n"
--- 116,126 ----
processor's decoders, but it's not always possible.
*/
+ #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
+ if(((unsigned long)from) & 15)
+ /* if SRC is misaligned */
for(; i>0; i--)
{
__asm__ __volatile__ (
"prefetchnta 320(%0)\n"
"movups (%0), %%xmm0\n"
"movups 16(%0), %%xmm1\n"
***************
*** 95,101 ****
"movups 48(%0), %%xmm1\n"
"movntps %%xmm0, 32(%1)\n"
"movntps %%xmm1, 48(%1)\n"
! #else /* Only K7 (may be other) */
! #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
--- 130,170 ----
"movups 48(%0), %%xmm1\n"
"movntps %%xmm0, 32(%1)\n"
+ "movntps %%xmm1, 48(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ else
+ /*
+ Only if SRC is aligned on 16-byte boundary.
+ This allows using movaps instead of movups; movaps requires the data
+ to be aligned, otherwise a general-protection exception (#GP) is generated.
+ */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ "prefetchnta 320(%0)\n"
+ "movaps (%0), %%xmm0\n"
+ "movaps 16(%0), %%xmm1\n"
+ "movntps %%xmm0, (%1)\n"
+ "movntps %%xmm1, 16(%1)\n"
+ "movaps 32(%0), %%xmm0\n"
+ "movaps 48(%0), %%xmm1\n"
+ "movntps %%xmm0, 32(%1)\n"
"movntps %%xmm1, 48(%1)\n"
! :: "r" (from), "r" (to) : "memory");
! from+=64;
! to+=64;
! }
! #else
! for(; i>0; i--)
! {
! __asm__ __volatile__ (
! #ifdef HAVE_K6_2PLUS
! "prefetch 320(%0)\n"
! #else
! "prefetchnta 320(%0)\n"
! #endif
! #ifdef HAVE_K6_2PLUS
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
***************
*** 114,118 ****
"movq %%mm2, 48(%1)\n"
"movq %%mm3, 56(%1)\n"
! #else
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
--- 183,187 ----
"movq %%mm2, 48(%1)\n"
"movq %%mm3, 56(%1)\n"
! #else /* K7 */
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
***************
*** 132,144 ****
"movntq %%mm3, 56(%1)\n"
#endif
- #endif
:: "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
! #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
__asm__ __volatile__ ("femms":::"memory");
! #else
__asm__ __volatile__ ("emms":::"memory");
#endif
}
--- 201,221 ----
"movntq %%mm3, 56(%1)\n"
#endif
:: "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
! #endif /* Have SSE */
! #ifdef HAVE_K6_2PLUS
! /* On K6 femms is faster than emms.
! On K7 femms is directly mapped to emms. */
__asm__ __volatile__ ("femms":::"memory");
! #else /* K7, P3, CyrixIII */
! /* since movntq is weakly-ordered, a "sfence"
! * is needed to become ordered again. */
! __asm__ __volatile__ ("sfence":::"memory");
! #ifndef HAVE_SSE
! /* clears the MMX state so the FPU can be used again */
__asm__ __volatile__ ("emms":::"memory");
+ #endif
#endif
}
***************
*** 150,154 ****
}
#define memcpy(a,b,c) fast_memcpy(a,b,c)
-
#undef small_memcpy
--- 227,230 ----
_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog
More information about the MPlayer-cvslog
mailing list