[MPlayer-dev-eng] xvidix threading
Zuxy Meng
zuxy.meng at gmail.com
Wed Dec 26 04:52:40 CET 2007
Hi,
2007/9/16, Howard Chu <hyc at highlandsun.com>:
> (Resending after subscribing...) I've been messing around with my Dvico FusionHDTV tuner card and
> running into performance problems. Playing standard-definition channels is easy enough, but playing
> a full 1920x1080i channel was just too much. Using xvidix/radeon for output...
>
> oprofile showed that the bulk of the execution time is in memcpy, from vidix_draw_slice:
>
> CPU: AMD64 processors, speed 2600 MHz (estimated)
> Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit
> mask) count 50000
> samples % image name app name symbol name
> 179887 40.7671 mplayer mplayer fast_memcpy
> 49657 11.2536 mplayer mplayer swScale_MMX2
> 29557 6.6984 mplayer mplayer mmxext_idct
> 20101 4.5554 mplayer mplayer hcscale_MMX2
> 15602 3.5358 mplayer mplayer slice_intra_DCT
> 15397 3.4894 mplayer mplayer MC_put_o_16_mmxext
> 14941 3.3860 mplayer mplayer mpeg2_slice
> 11475 2.6005 mplayer mplayer motion_fr_frame_420
> 9799 2.2207 mplayer mplayer get_non_intra_block
>
> CPU: AMD64 processors, speed 2600 MHz (estimated)
> Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit
> mask) count 50000
> samples % image name app name symbol name
> -------------------------------------------------------------------------------
> 142 0.4817 mplayer mplayer vidix_draw_slice_420
> 29337 99.5183 mplayer mplayer fast_memcpy
> 29479 22.3475 mplayer mplayer fast_memcpy_SSE
> 29479 100.000 mplayer mplayer fast_memcpy_SSE [self]
> -------------------------------------------------------------------------------
> 73 0.9778 mplayer mplayer mpeg2_slice
> 7393 99.0222 mplayer mplayer slice_non_intra_DCT
> 7466 5.6598 mplayer mplayer get_non_intra_block
> 7466 100.000 mplayer mplayer get_non_intra_block [self]
> -------------------------------------------------------------------------------
>
> I tried scaling it down to 960x540 which was enough to bring the CPU usage down to around 95% or so,
> but it was still maxing out occasionally. Since I've got a dual core processor and the other core
> was idle, I tried moving the draw_slice function into a separate thread. This turned out to work
> pretty well; instead of maxing out at 100% of one core I can now manage higher resolutions without
> any frame losses (using both cores). (I also tried the -dr option, but it only worked for a few
> image sizes. Not sure what the story is there.)
>
> The attached diff (against current svn) is just for your consideration, for real use it would need a
> commandline switch to toggle it on/off. By the way, it seems to have some sync problems when used
> without the scale filter.
>
> --
> -- Howard Chu
> Chief Architect, Symas Corp. http://www.symas.com
> Director, Highland Sun http://highlandsun.com/hyc/
> Chief Architect, OpenLDAP http://www.openldap.org/project/
>
>
> Index: vosub_vidix.c
> ===================================================================
> --- vosub_vidix.c (revision 24537)
> +++ vosub_vidix.c (working copy)
> @@ -22,6 +22,7 @@
> #include <stdlib.h>
> #include <string.h>
> #include <errno.h>
> +#include <pthread.h>
>
> #include "config.h"
> #include "mp_msg.h"
> @@ -49,9 +50,14 @@
> static vidix_playback_t vidix_play;
> static vidix_fourcc_t vidix_fourcc;
> static vo_functions_t * vo_server;
> +static vo_functions_t vo_local;
> static vidix_yuv_t dstrides;
> /*static uint32_t (*server_control)(uint32_t request, void *data, ...);*/
>
> +static pthread_mutex_t vidix_mutex = PTHREAD_MUTEX_INITIALIZER;
> +static pthread_cond_t vidix_cond = PTHREAD_COND_INITIALIZER;
> +static pthread_t vidix_thread;
> +
> int vidix_start(void)
> {
> int err;
> @@ -96,7 +102,7 @@
> dest += dstrides.y*y + x;
> src = image[0];
> for(i=0;i<h;i++){
> - memcpy(dest,src,w);
> + mem2agpcpy(dest,src,w);
> src+=stride[0];
> dest += dstrides.y;
> }
> @@ -122,6 +128,7 @@
> src += stride[1];
> src2+= stride[2];
> }
> + i = -1;
> }
> else
> {
> @@ -130,7 +137,7 @@
> dest += dstrides.v*y/4 + x;
> src = image[1];
> for(i=0;i<h/2;i++){
> - memcpy(dest,src,w/2);
> + mem2agpcpy(dest,src,w/2);
> src+=stride[1];
> dest+=dstrides.v/2;
> }
> @@ -140,13 +147,13 @@
> dest += dstrides.u*y/4 + x;
> src = image[2];
> for(i=0;i<h/2;i++){
> - memcpy(dest,src,w/2);
> + mem2agpcpy(dest,src,w/2);
> src+=stride[2];
> dest += dstrides.u/2;
> }
> - return 0;
> + i = 0;
> }
> - return -1;
> + return i;
> }
>
> static uint32_t vidix_draw_slice_410(uint8_t *image[], int stride[], int w,int h,int x,int y)
> @@ -241,10 +248,38 @@
> return 0;
> }
>
> +static uint8_t *vds_image[3];
> +static int vds_stride[3], vds_w, vds_h, vds_x, vds_y;
> +
> +static void vidix_task()
> +{
> + pthread_mutex_lock(&vidix_mutex);
> + pthread_cond_signal(&vidix_cond);
> +
> + for(;;) {
> + pthread_cond_wait(&vidix_cond, &vidix_mutex);
> + if (!vds_image[0]) break;
> + vo_local.draw_slice(vds_image, vds_stride, vds_w, vds_h, vds_x, vds_y);
> + vds_image[0] = NULL;
> + }
> +}
> +
> uint32_t vidix_draw_slice(uint8_t *image[], int stride[], int w,int h,int x,int y)
> {
> - mp_msg(MSGT_VO,MSGL_WARN, MSGTR_LIBVO_SUB_VIDIX_DummyVidixdrawsliceWasCalled);
> - return -1;
> + pthread_mutex_lock(&vidix_mutex);
> + vds_image[0] = image[0];
> + vds_image[1] = image[1];
> + vds_image[2] = image[2];
> + vds_stride[0] = stride[0];
> + vds_stride[1] = stride[1];
> + vds_stride[2] = stride[2];
> + vds_w = w;
> + vds_h = h;
> + vds_x = x;
> + vds_y = y;
> + pthread_cond_signal(&vidix_cond);
> + pthread_mutex_unlock(&vidix_mutex);
> + return 0;
> }
>
> static uint32_t vidix_draw_image(mp_image_t *mpi){
> @@ -279,7 +314,7 @@
> static void draw_alpha(int x0,int y0, int w,int h, unsigned char* src, unsigned char *srca, int stride)
> {
> uint32_t apitch,bespitch;
> - void *lvo_mem;
> + char *lvo_mem;
> lvo_mem = vidix_mem + vidix_play.offsets[next_frame] + vidix_play.offset.y;
> apitch = vidix_play.dest.pitch.y-1;
> switch(vidix_play.fourcc){
> @@ -557,11 +592,17 @@
> is_422_planes_eq = sstride == dstrides.y;
>
> if(src_format == IMGFMT_YV12 || src_format == IMGFMT_I420 || src_format == IMGFMT_IYUV)
> - vo_server->draw_slice = vidix_draw_slice_420;
> + vo_local.draw_slice = vidix_draw_slice_420;
> else if (src_format == IMGFMT_YVU9 || src_format == IMGFMT_IF09)
> - vo_server->draw_slice = vidix_draw_slice_410;
> - else vo_server->draw_slice = vidix_draw_slice_packed;
> + vo_local.draw_slice = vidix_draw_slice_410;
> + else vo_local.draw_slice = vidix_draw_slice_packed;
> }
> + if (!vidix_thread) {
> + pthread_mutex_lock(&vidix_mutex);
> + pthread_create(&vidix_thread, NULL, vidix_task, NULL);
> + pthread_cond_wait(&vidix_cond, &vidix_mutex);
> + pthread_mutex_unlock(&vidix_mutex);
> + }
> return 0;
> }
Does anybody care about this?
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
More information about the MPlayer-dev-eng
mailing list