[MPlayer-dev-eng] [RFC] emms/sfence in fast memcpy()
Michael Niedermayer
michaelni at gmx.at
Mon May 28 10:34:41 CEST 2007
On Mon, May 28, 2007 at 12:40:21AM +0200, Reimar Döffinger wrote:
> Hello,
> On Sun, May 27, 2007 at 11:53:39PM +0200, Michael Niedermayer wrote:
> > current fast memcpy (agp and normal) variants do emms and sfence at the
> > end, this is slow and particualry annoying if the copy is called once per
> > line which it is in many cases
> > i think that something should be done about that, though iam not
> > volunteering to do the work
> >
> > if we didnt do this silly redefine memcpy() then this would be easier ...
> > a mp/av_memcpy() could simply leave the emms/sfence to the caller
>
> I have already done most of the work to make all relevant places use
> fast_memcpy explicitly, see attached patch.
> In about 5 places it is obviously nonsense to use fast_memcpy, I would
> replaces these in a second patch. There are also two fixmes in
5 ?
[...]
> Index: libmpcodecs/vf_yadif.c
> ===================================================================
> --- libmpcodecs/vf_yadif.c (revision 23390)
> +++ libmpcodecs/vf_yadif.c (working copy)
> @@ -62,7 +62,7 @@
> static void store_ref(struct vf_priv_s *p, uint8_t *src[3], int src_stride[3], int width, int height){
> int i;
>
> - memcpy (p->ref[3], p->ref[0], sizeof(uint8_t *)*3);
> + fast_memcpy (p->ref[3], p->ref[0], sizeof(uint8_t *)*3);
> memmove(p->ref[0], p->ref[1], sizeof(uint8_t *)*3*3);
12 bytes ...
[...]
> Index: libmpcodecs/vd_mtga.c
> ===================================================================
> --- libmpcodecs/vd_mtga.c (revision 23390)
> +++ libmpcodecs/vd_mtga.c (working copy)
> @@ -110,20 +110,20 @@
>
> if (packet_header & 0x80) /* runlength encoded packet */
> {
> - memcpy(final, data, num_bytes);
> + fast_memcpy(final, data, num_bytes);
<= 4 bytes
>
> // Note: this will be slow when DR to vram!
> i=num_bytes;
> while(2*i<=replen){
> - memcpy(final+i,final,i);
> + fast_memcpy(final+i,final,i);
> i*=2;
> }
is being read immedeatly afterwards ...
[...]
> Index: libao2/ao_dsound.c
> ===================================================================
> --- libao2/ao_dsound.c (revision 23390)
> +++ libao2/ao_dsound.c (working copy)
> @@ -199,7 +199,7 @@
> if(device_num==*device_index){
> mp_msg(MSGT_AO, MSGL_V,"<--");
> if(guid){
> - memcpy(&device,guid,sizeof(GUID));
> + fast_memcpy(&device,guid,sizeof(GUID));
> }
i dont think GUID is that big ...
> }
> mp_msg(MSGT_AO, MSGL_V,"\n");
> @@ -337,14 +337,14 @@
> numsamp = dwBytes1 / (ao_data.channels * sampsize); // number of samples for each channel in this buffer
>
> for( i = 0; i < numsamp; i++ ) for( j = 0; j < ao_data.channels; j++ ) {
> - memcpy(lpvPtr1+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
> + fast_memcpy(lpvPtr1+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
> }
>
> if (NULL != lpvPtr2 )
> {
> numsamp = dwBytes2 / (ao_data.channels * sampsize);
> for( i = 0; i < numsamp; i++ ) for( j = 0; j < ao_data.channels; j++ ) {
> - memcpy(lpvPtr2+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+dwBytes1+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
> + fast_memcpy(lpvPtr2+(i*ao_data.channels*sampsize)+(chantable[j]*sampsize),data+dwBytes1+(i*ao_data.channels*sampsize)+(j*sampsize),sampsize);
if sampsize is what i think it is (size of one sample) then this is not
a good idea
[...]
> Index: mp3lib/sr1.c
> ===================================================================
> --- mp3lib/sr1.c (revision 23390)
> +++ mp3lib/sr1.c (working copy)
> @@ -172,7 +172,7 @@
> // if(backstep!=512 && backstep>fsizeold)
> // printf("\rWarning! backstep (%d>%d) \n",backstep,fsizeold);
> wordpointer = bsbuf + ssize - backstep;
> - if (backstep) memcpy(wordpointer,bsbufold+fsizeold-backstep,backstep);
> + if (backstep) fast_memcpy(wordpointer,bsbufold+fsizeold-backstep,backstep);
> bitindex = 0;
i think the dst will be read quickly after this
[...]
> Index: libvo/vo_vesa.c
> ===================================================================
> --- libvo/vo_vesa.c (revision 23390)
> +++ libvo/vo_vesa.c (working copy)
> @@ -217,7 +217,7 @@
> color = (r << shift_r) | (g << shift_g) | (b << shift_b);
> offset = y * bpl + (x * pixel_size);
> if(!VALID_WIN_FRAME(offset)) __vbeSwitchBank(offset);
> - memcpy(VIDEO_PTR(offset), &color, pixel_size);
> + fast_memcpy(VIDEO_PTR(offset), &color, pixel_size);
> }
>
> /*
> @@ -226,7 +226,7 @@
[...]
> @@ -649,7 +649,7 @@
> else fs_mode = 1;
> }
> if((err=vbeInit()) != VBE_OK) { PRINT_VBE_ERR("vbeInit",err); return -1; }
> - memcpy(vib.VESASignature,"VBE2",4);
> + fast_memcpy(vib.VESASignature,"VBE2",4);
> if(!vib_set && (err=vbeGetControllerInfo(&vib)) != VBE_OK)
size=4
> {
> PRINT_VBE_ERR("vbeGetControllerInfo",err);
> Index: libvo/vo_zr2.c
> ===================================================================
> --- libvo/vo_zr2.c (revision 23390)
> +++ libvo/vo_zr2.c (working copy)
[...]
> @@ -394,7 +394,7 @@
> * We make configuration changes to a temporary params structure,
> * compare it with the old params structure and only apply the new
> * config if it is different from the old one. */
> - memcpy(&zptmp, &p->zp, sizeof(zptmp));
> + fast_memcpy(&zptmp, &p->zp, sizeof(zptmp));
this is accessed immedeatly afterwards so shouldnt be non temporal
>
> /* translate the configuration to zoran understandable format */
> zptmp.decimation = 0;
> @@ -423,7 +423,7 @@
>
> if (memcmp(&zptmp, &p->zp, sizeof(zptmp))) {
> /* config differs, we must update */
> - memcpy(&p->zp, &zptmp, sizeof(zptmp));
> + fast_memcpy(&p->zp, &zptmp, sizeof(zptmp));
> stop_playing(p);
dst is in cache so non temporal writes (and fast_memcpy) makes no sense
[...]
> @@ -1106,7 +1106,7 @@
> },*p;
>
> p = malloc(sizeof(m));
> - memcpy(p,m,sizeof(m));
> + fast_memcpy(p,m,sizeof(m));
> return p;
look silly
[...]
> Index: libmenu/vf_menu.c
> ===================================================================
> --- libmenu/vf_menu.c (revision 23390)
> +++ libmenu/vf_menu.c (working copy)
> @@ -146,8 +146,8 @@
>
> if(mpi->type == MP_IMGTYPE_TEMP && (!(mpi->flags&MP_IMGFLAG_PRESERVE)) ) {
> dmpi = vf_get_image(vf->next,mpi->imgfmt,mpi->type, mpi->flags, mpi->w, mpi->h);
> - memcpy(mpi->planes,dmpi->planes,MP_MAX_PLANES*sizeof(unsigned char*));
> - memcpy(mpi->stride,dmpi->stride,MP_MAX_PLANES*sizeof(unsigned int));
> + fast_memcpy(mpi->planes,dmpi->planes,MP_MAX_PLANES*sizeof(unsigned char*));
> + fast_memcpy(mpi->stride,dmpi->stride,MP_MAX_PLANES*sizeof(unsigned int));
silly MP_MAX_PLANES*sizeof(unsigned char*) is 16 or 32
[...]
> Index: mencoder.c
> ===================================================================
> --- mencoder.c (revision 23390)
> +++ mencoder.c (working copy)
[...]
> @@ -771,7 +771,7 @@
> if (!curfile) {
> if (sh_video->bih) {
> mux_v->bih=malloc(sh_video->bih->biSize);
> - memcpy(mux_v->bih, sh_video->bih, sh_video->bih->biSize);
> + fast_memcpy(mux_v->bih, sh_video->bih, sh_video->bih->biSize);
> }
> else
> {
> @@ -941,7 +941,7 @@
> }
> if (sh_audio->wf){
> mux_a->wf=malloc(sizeof(WAVEFORMATEX) + sh_audio->wf->cbSize);
> - memcpy(mux_a->wf, sh_audio->wf, sizeof(WAVEFORMATEX) + sh_audio->wf->cbSize);
> + fast_memcpy(mux_a->wf, sh_audio->wf, sizeof(WAVEFORMATEX) + sh_audio->wf->cbSize);
> if(!sh_audio->i_bps) sh_audio->i_bps=mux_a->wf->nAvgBytesPerSec;
> } else {
> mux_a->wf = malloc(sizeof(WAVEFORMATEX));
once executed initalization ...
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
The worst form of inequality is to try to make unequal things equal.
-- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/attachments/20070528/c9a25ce5/attachment.pgp>
More information about the MPlayer-dev-eng
mailing list