[Mplayer-cvslog] CVS: main/libvo fastmemcpy.h,1.9,1.10

Felix Buenemann atmosfear at users.sourceforge.net
Sun Apr 22 21:25:49 CEST 2001


Update of /cvsroot/mplayer/main/libvo
In directory usw-pr-cvs1:/tmp/cvs-serv28452

Modified Files:
	fastmemcpy.h 
Log Message:
New optimized SSE code, overall optimizations.


Index: fastmemcpy.h
===================================================================
RCS file: /cvsroot/mplayer/main/libvo/fastmemcpy.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -r1.9 -r1.10
*** fastmemcpy.h	2001/04/21 21:49:28	1.9
--- fastmemcpy.h	2001/04/22 19:25:47	1.10
***************
*** 1,3 ****
- 
  #ifndef __MPLAYER_MEMCPY
  #define __MPLAYER_MEMCPY
--- 1,2 ----
***************
*** 5,11 ****
  /*
  This part of code was taken from Linux-2.4.3 and slightly modified
! for MMX2 instruction set. I have done it since linux uses page aligned
  blocks but mplayer uses weakly ordered data and original sources can not
! speedup their. Only using prefetchnta and movntq together have effect! 
  If you have questions please contact with me: Nick Kurshev: nickols_k at mail.ru.
  */
--- 4,45 ----
  /*
  This part of code was taken from Linux-2.4.3 and slightly modified
! for MMX2, SSE instruction set. I have done it since linux uses page aligned
  blocks but mplayer uses weakly ordered data and original sources can not
! speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
! 
! From IA-32 Intel Architecture Software Developer's Manual Volume 1,
! Order Number 245470:
! "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
! 
! Data referenced by a program can be temporal (data will be used again) or
! non-temporal (data will be referenced once and not reused in the immediate
! future). To make efficient use of the processor's caches, it is generally
! desirable to cache temporal data and not cache non-temporal data. Overloading
! the processor's caches with non-temporal data is sometimes referred to as
! "polluting the caches". 
! The non-temporal data is written to memory with Write-Combining semantics.
! 
! The PREFETCHh instructions permit a program to load data into the processor
! at a suggested cache level, so that it is closer to the processor's load and
! store unit when it is needed. If the data is already present in a level of
! the cache hierarchy that is closer to the processor, the PREFETCHh instruction
! will not result in any data movement.
! But we should use PREFETCHNTA: it fetches non-temporal data into a location
! close to the processor, minimizing cache pollution.
! 
! The MOVNTQ (store quadword using non-temporal hint) instruction stores
! packed integer data from an MMX register to memory, using a non-temporal hint.
! The MOVNTPS (store packed single-precision floating-point values using
! non-temporal hint) instruction stores packed floating-point data from an
! XMM register to memory, using a non-temporal hint.
! 
! The SFENCE (Store Fence) instruction controls write ordering by creating a
! fence for memory store operations. This instruction guarantees that the results
! of every store instruction that precedes the store fence in program order is
! globally visible before any store instruction that follows the fence. The
! SFENCE instruction provides an efficient way of ensuring ordering between
! procedures that produce weakly-ordered data and procedures that consume that
! data.
! 
  If you have questions please contact with me: Nick Kurshev: nickols_k at mail.ru.
  */
***************
*** 16,19 ****
--- 50,58 ----
  #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW )
  
+ #undef HAVE_K6_2PLUS
+ #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
+ #define HAVE_K6_2PLUS
+ #endif
+ 
  /* for small memory blocks (<256 bytes) this version is faster */
  #define small_memcpy(to,from,n)\
***************
*** 30,39 ****
  	int i;
  
-         
  #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
  //        printf("fastmemcpy_pre(0x%X,0x%X,0x%X)\n",to,from,len);
          // Align dest to 16-byte boundary:
!         if((unsigned int)to&15){
!           int len2=16-((unsigned int)to&15);
            if(len>len2){
              len-=len2;
--- 69,77 ----
  	int i;
  
  #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
  //        printf("fastmemcpy_pre(0x%X,0x%X,0x%X)\n",to,from,len);
          // Align dest to 16-byte boundary:
!         if((unsigned long)to&15){
!           int len2=16-((unsigned long)to&15);
            if(len>len2){
              len-=len2;
***************
*** 47,51 ****
  //        printf("fastmemcpy(0x%X,0x%X,0x%X)\n",to,from,len);
  #endif
-     
  
          if(len >= 0x200) /* 512-byte blocks */
--- 85,88 ----
***************
*** 56,60 ****
  	  
  	__asm__ __volatile__ (
! #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
  	        "prefetch (%0)\n"
  	        "prefetch 64(%0)\n"
--- 93,97 ----
  	  
  	__asm__ __volatile__ (
! #ifdef HAVE_K6_2PLUS
  	        "prefetch (%0)\n"
  	        "prefetch 64(%0)\n"
***************
*** 62,66 ****
          	"prefetch 192(%0)\n"
          	"prefetch 256(%0)\n"
! #else
  		"prefetchnta (%0)\n"
  		"prefetchnta 64(%0)\n"
--- 99,103 ----
          	"prefetch 192(%0)\n"
          	"prefetch 256(%0)\n"
! #else /* K7, P3, CyrixIII */
  		"prefetchnta (%0)\n"
  		"prefetchnta 64(%0)\n"
***************
*** 79,91 ****
             processor's decoders, but it's not always possible.
          */
  	for(; i>0; i--)
  	{
  		__asm__ __volatile__ (
- #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
-         	"prefetch 320(%0)\n"
- #else
  		"prefetchnta 320(%0)\n"
- #endif
- #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
  		"movups (%0), %%xmm0\n"
  		"movups 16(%0), %%xmm1\n"
--- 116,126 ----
             processor's decoders, but it's not always possible.
          */
+ #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
+ 	if(((unsigned long)from) & 15)
+ 	/* if SRC is misaligned */
  	for(; i>0; i--)
  	{
  		__asm__ __volatile__ (
  		"prefetchnta 320(%0)\n"
  		"movups (%0), %%xmm0\n"
  		"movups 16(%0), %%xmm1\n"
***************
*** 95,101 ****
  		"movups 48(%0), %%xmm1\n"
  		"movntps %%xmm0, 32(%1)\n"
  		"movntps %%xmm1, 48(%1)\n"
! #else /* Only K7 (may be other) */
! #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
          	"movq (%0), %%mm0\n"
          	"movq 8(%0), %%mm1\n"
--- 130,170 ----
  		"movups 48(%0), %%xmm1\n"
  		"movntps %%xmm0, 32(%1)\n"
+ 		"movntps %%xmm1, 48(%1)\n"
+ 		:: "r" (from), "r" (to) : "memory");
+ 		from+=64;
+ 		to+=64;
+ 	}
+ 	else 
+ 	/*
+ 	   Only if SRC is aligned on 16-byte boundary.
+ 	   It allows the use of movaps instead of movups; movaps requires the data
+ 	   to be aligned, otherwise a general-protection exception (#GP) is generated.
+ 	*/
+ 	for(; i>0; i--)
+ 	{
+ 		__asm__ __volatile__ (
+ 		"prefetchnta 320(%0)\n"
+ 		"movaps (%0), %%xmm0\n"
+ 		"movaps 16(%0), %%xmm1\n"
+ 		"movntps %%xmm0, (%1)\n"
+ 		"movntps %%xmm1, 16(%1)\n"
+ 		"movaps 32(%0), %%xmm0\n"
+ 		"movaps 48(%0), %%xmm1\n"
+ 		"movntps %%xmm0, 32(%1)\n"
  		"movntps %%xmm1, 48(%1)\n"
! 		:: "r" (from), "r" (to) : "memory");
! 		from+=64;
! 		to+=64;
! 	}
! #else
! 	for(; i>0; i--)
! 	{
! 		__asm__ __volatile__ (
! #ifdef HAVE_K6_2PLUS
!         	"prefetch 320(%0)\n"
! #else
! 		"prefetchnta 320(%0)\n"
! #endif
! #ifdef HAVE_K6_2PLUS
          	"movq (%0), %%mm0\n"
          	"movq 8(%0), %%mm1\n"
***************
*** 114,118 ****
          	"movq %%mm2, 48(%1)\n"
          	"movq %%mm3, 56(%1)\n"
! #else
  		"movq (%0), %%mm0\n"
  		"movq 8(%0), %%mm1\n"
--- 183,187 ----
          	"movq %%mm2, 48(%1)\n"
          	"movq %%mm3, 56(%1)\n"
! #else /* K7 */
  		"movq (%0), %%mm0\n"
  		"movq 8(%0), %%mm1\n"
***************
*** 132,144 ****
  		"movntq %%mm3, 56(%1)\n"
  #endif
- #endif
  		:: "r" (from), "r" (to) : "memory");
  		from+=64;
  		to+=64;
  	}
! #if defined( HAVE_3DNOW ) && !defined( HAVE_MMX2 )
  		__asm__ __volatile__ ("femms":::"memory");
! #else
  		__asm__ __volatile__ ("emms":::"memory");
  #endif
  	}
--- 201,221 ----
  		"movntq %%mm3, 56(%1)\n"
  #endif
  		:: "r" (from), "r" (to) : "memory");
  		from+=64;
  		to+=64;
  	}
! #endif /* Have SSE */
! #ifdef HAVE_K6_2PLUS
!                 /* On K6 femms is faster than emms.
! 		   On K7 femms is directly mapped to emms. */
  		__asm__ __volatile__ ("femms":::"memory");
! #else /* K7, P3, CyrixIII */
!                 /* since movntq is weakly-ordered, a "sfence"
! 		 * is needed to become ordered again. */
! 		__asm__ __volatile__ ("sfence":::"memory");
! #ifndef HAVE_SSE		
! 		/* enables to use FPU */
  		__asm__ __volatile__ ("emms":::"memory");
+ #endif		
  #endif
  	}
***************
*** 150,154 ****
  }
  #define memcpy(a,b,c) fast_memcpy(a,b,c)
- 
  #undef small_memcpy
  
--- 227,230 ----


_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog



More information about the MPlayer-cvslog mailing list