[MPlayer-dev-eng] [RFC] emms/sfence in fast memcpy()

Michael Niedermayer michaelni at gmx.at
Mon May 28 10:34:41 CEST 2007


On Mon, May 28, 2007 at 12:40:21AM +0200, Reimar Döffinger wrote:
> Hello,
> On Sun, May 27, 2007 at 11:53:39PM +0200, Michael Niedermayer wrote:
> > current fast memcpy (agp and normal) variants do emms and sfence at the
> > end, this is slow and particualry annoying if the copy is called once per
> > line which it is in many cases
> > i think that something should be done about that, though iam not
> > volunteering to do the work
> > 
> > if we didnt do this silly redefine memcpy() then this would be easier ...
> > a mp/av_memcpy() could simply leave the emms/sfence to the caller
> 
> I have already done most of the work to make all relevant places use
> fast_memcpy explicitly, see attached patch.
> In about 5 places it is obviously nonsense to use fast_memcpy, I would
> replaces these in a second patch. There are also two fixmes in

5 ?

[...]
> Index: libmpcodecs/vf_yadif.c
> ===================================================================
> --- libmpcodecs/vf_yadif.c	(revision 23390)
> +++ libmpcodecs/vf_yadif.c	(working copy)
> @@ -62,7 +62,7 @@
>  static void store_ref(struct vf_priv_s *p, uint8_t *src[3], int src_stride[3], int width, int height){
>      int i;
>  
> -    memcpy (p->ref[3], p->ref[0], sizeof(uint8_t *)*3);
> +    fast_memcpy (p->ref[3], p->ref[0], sizeof(uint8_t *)*3);
>      memmove(p->ref[0], p->ref[1], sizeof(uint8_t *)*3*3);

12 bytes ...


[...]
> Index: libmpcodecs/vd_mtga.c
> ===================================================================
> --- libmpcodecs/vd_mtga.c	(revision 23390)
> +++ libmpcodecs/vd_mtga.c	(working copy)
> @@ -110,20 +110,20 @@
>  	    
>  	    if (packet_header & 0x80) /* runlength encoded packet */
>  	    {
> -		memcpy(final, data, num_bytes);
> +		fast_memcpy(final, data, num_bytes);

<= 4 bytes


>  		
>  		// Note: this will be slow when DR to vram!
>  		i=num_bytes;
>  		while(2*i<=replen){
> -		    memcpy(final+i,final,i);
> +		    fast_memcpy(final+i,final,i);
>  		    i*=2;
>  		}

is being read immedeatly afterwards ...



[...]
> Index: libao2/ao_dsound.c
> ===================================================================
> --- libao2/ao_dsound.c	(revision 23390)
> +++ libao2/ao_dsound.c	(working copy)
> @@ -199,7 +199,7 @@
>      if(device_num==*device_index){
>          mp_msg(MSGT_AO, MSGL_V,"<--");
>          if(guid){
> -            memcpy(&device,guid,sizeof(GUID));
> +            fast_memcpy(&device,guid,sizeof(GUID));
>          }

i dont think GUID is that big ...


>      }
>      mp_msg(MSGT_AO, MSGL_V,"\n");
> @@ -337,14 +337,14 @@
>    	    numsamp = dwBytes1 / (ao_data.channels * sampsize);  // number of samples for each channel in this buffer
>  
>    	    for( i = 0; i < numsamp; i++ ) for( j = 0; j < ao_data.channels; j++ ) {
> -  	        memcpy(lpvPtr1+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
> +  	        fast_memcpy(lpvPtr1+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
>    	    }
>  
>    	    if (NULL != lpvPtr2 )
>    	    {
>    	        numsamp = dwBytes2 / (ao_data.channels * sampsize);
>    	        for( i = 0; i < numsamp; i++ ) for( j = 0; j < ao_data.channels; j++ ) {
> -  	            memcpy(lpvPtr2+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+dwBytes1+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
> +  	            fast_memcpy(lpvPtr2+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+dwBytes1+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
if sampsize is what i think it is (size of one sample) then this is not
a good idea


[...]
> Index: mp3lib/sr1.c
> ===================================================================
> --- mp3lib/sr1.c	(revision 23390)
> +++ mp3lib/sr1.c	(working copy)
> @@ -172,7 +172,7 @@
>  //  if(backstep!=512 && backstep>fsizeold)
>  //    printf("\rWarning! backstep (%d>%d)                                         \n",backstep,fsizeold);
>    wordpointer = bsbuf + ssize - backstep;
> -  if (backstep) memcpy(wordpointer,bsbufold+fsizeold-backstep,backstep);
> +  if (backstep) fast_memcpy(wordpointer,bsbufold+fsizeold-backstep,backstep);
>    bitindex = 0;

i think the dst will be read quickly after this


[...]
> Index: libvo/vo_vesa.c
> ===================================================================
> --- libvo/vo_vesa.c	(revision 23390)
> +++ libvo/vo_vesa.c	(working copy)
> @@ -217,7 +217,7 @@
>  	color = (r << shift_r) | (g << shift_g) | (b << shift_b);
>  	offset = y * bpl + (x * pixel_size);
>          if(!VALID_WIN_FRAME(offset)) __vbeSwitchBank(offset);
> -	memcpy(VIDEO_PTR(offset), &color, pixel_size);
> +	fast_memcpy(VIDEO_PTR(offset), &color, pixel_size);
>  }
>  
>  /*
> @@ -226,7 +226,7 @@
[...]
> @@ -649,7 +649,7 @@
>  	  else          fs_mode = 1;
>  	} 
>  	if((err=vbeInit()) != VBE_OK) { PRINT_VBE_ERR("vbeInit",err); return -1; }
> -	memcpy(vib.VESASignature,"VBE2",4);
> +	fast_memcpy(vib.VESASignature,"VBE2",4);
>  	if(!vib_set && (err=vbeGetControllerInfo(&vib)) != VBE_OK)

size=4


>  	{
>  	  PRINT_VBE_ERR("vbeGetControllerInfo",err);
> Index: libvo/vo_zr2.c
> ===================================================================
> --- libvo/vo_zr2.c	(revision 23390)
> +++ libvo/vo_zr2.c	(working copy)
[...]
> @@ -394,7 +394,7 @@
>  	 * We make configuration changes to a temporary params structure,
>  	 * compare it with the old params structure and only apply the new
>  	 * config if it is different from the old one. */
> -	memcpy(&zptmp, &p->zp, sizeof(zptmp));
> +	fast_memcpy(&zptmp, &p->zp, sizeof(zptmp));


this is accessed immedeatly afterwards so shouldnt be non temporal


>  
>  	/* translate the configuration to zoran understandable format */
>  	zptmp.decimation = 0;
> @@ -423,7 +423,7 @@
>  
>  	if (memcmp(&zptmp, &p->zp, sizeof(zptmp))) {
>  		/* config differs, we must update */
> -		memcpy(&p->zp, &zptmp, sizeof(zptmp));
> +		fast_memcpy(&p->zp, &zptmp, sizeof(zptmp));
>  		stop_playing(p);

dst is in cache so non temporal writes (and fast_memcpy) makes no sense



[...]

> @@ -1106,7 +1106,7 @@
>      },*p;
>  
>      p = malloc(sizeof(m));
> -    memcpy(p,m,sizeof(m));
> +    fast_memcpy(p,m,sizeof(m));
>      return p;

look silly


[...]
> Index: libmenu/vf_menu.c
> ===================================================================
> --- libmenu/vf_menu.c	(revision 23390)
> +++ libmenu/vf_menu.c	(working copy)
> @@ -146,8 +146,8 @@
>  
>    if(mpi->type == MP_IMGTYPE_TEMP && (!(mpi->flags&MP_IMGFLAG_PRESERVE)) ) {
>      dmpi = vf_get_image(vf->next,mpi->imgfmt,mpi->type, mpi->flags, mpi->w, mpi->h);
> -    memcpy(mpi->planes,dmpi->planes,MP_MAX_PLANES*sizeof(unsigned char*));
> -    memcpy(mpi->stride,dmpi->stride,MP_MAX_PLANES*sizeof(unsigned int));
> +    fast_memcpy(mpi->planes,dmpi->planes,MP_MAX_PLANES*sizeof(unsigned char*));
> +    fast_memcpy(mpi->stride,dmpi->stride,MP_MAX_PLANES*sizeof(unsigned int));

silly MP_MAX_PLANES*sizeof(unsigned char*) is 16 or 32


[...]
> Index: mencoder.c
> ===================================================================
> --- mencoder.c	(revision 23390)
> +++ mencoder.c	(working copy)
[...]
> @@ -771,7 +771,7 @@
>  	if (!curfile) {
>  		if (sh_video->bih) {
>  			mux_v->bih=malloc(sh_video->bih->biSize);
> -			memcpy(mux_v->bih, sh_video->bih, sh_video->bih->biSize);
> +			fast_memcpy(mux_v->bih, sh_video->bih, sh_video->bih->biSize);
>  		}
>      else
>      {
> @@ -941,7 +941,7 @@
>      }
>      if (sh_audio->wf){
>  	mux_a->wf=malloc(sizeof(WAVEFORMATEX) + sh_audio->wf->cbSize);
> -	memcpy(mux_a->wf, sh_audio->wf, sizeof(WAVEFORMATEX) + sh_audio->wf->cbSize);
> +	fast_memcpy(mux_a->wf, sh_audio->wf, sizeof(WAVEFORMATEX) + sh_audio->wf->cbSize);
>  	if(!sh_audio->i_bps) sh_audio->i_bps=mux_a->wf->nAvgBytesPerSec;
>      } else {
>  	mux_a->wf = malloc(sizeof(WAVEFORMATEX));

once executed initalization ...

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The worst form of inequality is to try to make unequal things equal.
-- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/attachments/20070528/c9a25ce5/attachment.pgp>


More information about the MPlayer-dev-eng mailing list