[FFmpeg-devel] [PATCH]opencl: automatically select the fastest opencl device
Stefano Sabatini
stefasab at gmail.com
Sat Nov 23 19:58:40 CET 2013
On date Friday 2013-11-22 12:53:38 -0600, Lenny Wang encoded:
> On Fri, Nov 22, 2013 at 4:42 AM, Lenny Wang <lenny at multicorewareinc.com> wrote:
[...]
> Patch updated, use "-show_opencl" utility to test all opencl devices
> and the fastest one got selected in the end.
> From 0dce93ffee13d03c72c7a071f9292623d8e81b96 Mon Sep 17 00:00:00 2001
> From: Lenny Wang <lwanghpc at gmail.com>
> Date: Fri, 22 Nov 2013 12:43:02 -0600
> Subject: [PATCH] cmdutils/opencl: show available opencl devices and
> perform tests
>
> Signed-off-by: Lenny Wang <lwanghpc at gmail.com>
> ---
> cmdutils.c | 242 +++++++++++++++++++++++++++++++++++++++++++++++++
> cmdutils.h | 6 ++
> cmdutils_common_opts.h | 1 +
> libavutil/opencl.c | 2 +-
> 4 files changed, 250 insertions(+), 1 deletion(-)
Missing doc/cmdutils.texi updates.
> diff --git a/cmdutils.c b/cmdutils.c
> index cbf660f..d99207b 100644
> --- a/cmdutils.c
> +++ b/cmdutils.c
> @@ -48,6 +48,7 @@
> #include "libavutil/dict.h"
> #include "libavutil/opt.h"
> #include "libavutil/cpu.h"
> +#include "libavutil/time.h"
> #include "cmdutils.h"
> #include "version.h"
> #if CONFIG_NETWORK
> @@ -987,6 +988,247 @@ int opt_timelimit(void *optctx, const char *opt, const char *arg)
> }
>
> #if CONFIG_OPENCL
> +const char *ocl_bench_source = AV_OPENCL_KERNEL(
> +inline unsigned char clip_uint8(int a)
> +{
> + if (a & (~0xFF))
> + return (-a)>>31;
> + else
> + return a;
> +}
> +
> +kernel void unsharp_luma(
> + global unsigned char *src,
> + global unsigned char *dst,
> + global int *mask,
> + int scalebits,
> + int halfscale,
> + int width,
> + int height)
> +{
> + int2 threadIdx, blockIdx, globalIdx;
nit: thread_idx, block_idx, global_idx for style consistency, same below
> + threadIdx.x = get_local_id(0);
> + threadIdx.y = get_local_id(1);
> + blockIdx.x = get_group_id(0);
> + blockIdx.y = get_group_id(1);
> + globalIdx.x = get_global_id(0);
> + globalIdx.y = get_global_id(1);
> +
> + local uchar l[32][32];
> + local int lc[LU_RADIUS_X*LU_RADIUS_Y];
> + int indexIx, indexIy, i, j;
> +
> + for(i = 0; i <= 1; i++) {
style nit: for_(...)
here and below
> + indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y;
> + indexIy = indexIy < 0 ? 0 : indexIy;
> + indexIy = indexIy >= height ? height - 1: indexIy;
> + for(j = 0; j <= 1; j++) {
> + indexIx = -8 + (blockIdx.x + j) * 16 + threadIdx.x;
> + indexIx = indexIx < 0 ? 0 : indexIx;
> + indexIx = indexIx >= width ? width - 1: indexIx;
> + l[i*16 + threadIdx.y][j*16 + threadIdx.x] = src[indexIy*width + indexIx];
> + }
> + }
> +
> + int indexL = threadIdx.y*16 + threadIdx.x;
> + if (indexL < LU_RADIUS_X*LU_RADIUS_Y)
> + lc[indexL] = mask[indexL];
> + barrier(CLK_LOCAL_MEM_FENCE);
> +
> + int idx, idy, maskIndex;
> + int sum = 0;
> + int steps_x = LU_RADIUS_X/2;
> + int steps_y = LU_RADIUS_Y/2;
> +
> + \n#pragma unroll\n
> + for (i = -steps_y; i <= steps_y; i++) {
> + idy = 8 + i + threadIdx.y;
> + \n#pragma unroll\n
> + for (j = -steps_x; j <= steps_x; j++) {
> + idx = 8 + j + threadIdx.x;
> + maskIndex = (i + steps_y)*LU_RADIUS_X + j + steps_x;
> + sum += (int)l[idy][idx] * lc[maskIndex];
> + }
> + }
> + int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8];
> + int res = temp + (((temp - (int)((sum + halfscale) >> scalebits))) >> 16);
> + if (globalIdx.x < width && globalIdx.y < height)
> + dst[globalIdx.x + globalIdx.y*width] = clip_uint8(res);
> +}
> +);
> +
> +static void fill_rand_int(int* data, int n)
nit: int *data
> +{
> + int i;
> + srand(av_gettime());
> + for (i = 0; i < n; i++)
> + data[i] = rand();
> +}
> +
> +static int run_benchmark(AVOpenCLExternalEnv *ext_opencl_env)
> +{
> + int i, ret = 0;
> + char build_opts[64];
> + int steps = 3;
> + int width = 1920;
> + int height = 1088;
> + int memsize = width*height*sizeof(char);
> + int masksize = sizeof(uint32_t) * (2 * steps + 1) * (2 * steps + 1);
> + int scalebits = (steps + steps) * 2;
> + int halfscale = 1 << (scalebits - 1);
> + int arg = 0;
> + int64_t start = 0;
> + size_t kernel_len;
> +
> + cl_mem cl_mask;
> + cl_mem cl_inbuf;
> + cl_mem cl_outbuf;
> + cl_kernel kernel = NULL;
> + cl_program program = NULL;
> + size_t localWorkSize2d[2] = {16, 16};
> + size_t globalWorkSize2d[2] = {(size_t)width, (size_t)height};
style
> +
> + char* inbuf = av_malloc(memsize);
> + int* mask = av_malloc(masksize);
nit: char *inbuf ...; int *mask ...
probably you should check av_malloc() result
> + fill_rand_int((int*)inbuf, memsize/4);
> + fill_rand_int(mask, masksize/4);
> +
> + cl_mask = clCreateBuffer(ext_opencl_env->context, CL_MEM_READ_ONLY, masksize, NULL, &ret);
> + cl_inbuf = clCreateBuffer(ext_opencl_env->context, CL_MEM_READ_ONLY, memsize, NULL, &ret);
> + cl_outbuf = clCreateBuffer(ext_opencl_env->context, CL_MEM_READ_WRITE, memsize, NULL, &ret);
> + if (ret < 0)
> + goto end;
if first fail and the last will succeed this will not fail, I don't
know if you can expect that a clCreateBuffer failure is always followed by a failure.
> +
> + kernel_len = strlen(ocl_bench_source);
> + program = clCreateProgramWithSource(ext_opencl_env->context, 1, &ocl_bench_source, &kernel_len, &ret);
> + if (ret != CL_SUCCESS || !program) {
> + av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create program\n");
... to create benchmark program
> + goto end;
> + }
> + snprintf(build_opts, 64, "-D LU_RADIUS_X=%d -D LU_RADIUS_Y=%d", 2*steps+1, 2*steps+1);
why don't you hardcode these in the kernel source?
> + ret = clBuildProgram(program, 1, &(ext_opencl_env->device_id), build_opts, NULL, NULL);
> + if (ret != CL_SUCCESS) {
> + av_log(NULL, AV_LOG_ERROR, "Unable to build program\n");
... benchmark program
> + goto end;
> + }
> + kernel = clCreateKernel(program, "unsharp_luma", &ret);
nit: "benchmark" or maybe "benchmark_unsharp_luma"
> + if (ret != CL_SUCCESS) {
> + goto end;
> + }
> + clEnqueueWriteBuffer(ext_opencl_env->command_queue, cl_inbuf, CL_TRUE, 0, memsize, inbuf, 0, NULL, NULL);
> + clEnqueueWriteBuffer(ext_opencl_env->command_queue, cl_mask, CL_TRUE, 0, masksize, mask, 0, NULL, NULL);
> +
> + ret = clSetKernelArg(kernel,arg++,sizeof(cl_mem),&cl_inbuf);
> + ret|= clSetKernelArg(kernel,arg++,sizeof(cl_mem),&cl_outbuf);
> + ret|= clSetKernelArg(kernel,arg++,sizeof(cl_mem),&cl_mask);
> + ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&scalebits);
> + ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&halfscale);
> + ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&width);
> + ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&height);
> + if (ret < 0)
> + goto end;
I wonder if ff_opencl_set_parameter() should be made public.
> +
> + // warm up
> + ret = clEnqueueNDRangeKernel(ext_opencl_env->command_queue, kernel, 2, NULL,
> + globalWorkSize2d, localWorkSize2d, 0, NULL, NULL);
> + clFinish(ext_opencl_env->command_queue);
> + if (ret < 0)
> + goto end;
> +
> + start = av_gettime();
> + for(i = 0; i < 5; i++)
for_(
you could replace literal 5 with a macro, such as OPENCL_NB_BENCHMARKS
> + clEnqueueNDRangeKernel(ext_opencl_env->command_queue, kernel, 2, NULL, globalWorkSize2d,
> + localWorkSize2d, 0, NULL, NULL);
> + clFinish(ext_opencl_env->command_queue);
> + ret = (int)((av_gettime() - start)/5);
> +end:
> + if(kernel)
if_(...)
> + clReleaseKernel(kernel);
> + if(program)
> + clReleaseProgram(program);
> + if(cl_inbuf)
> + clReleaseMemObject(cl_inbuf);
> + if(cl_outbuf)
> + clReleaseMemObject(cl_outbuf);
> + if(cl_mask)
> + clReleaseMemObject(cl_mask);
> + av_free(inbuf);
> + av_free(mask);
> + return ret;
> +}
> +
> +static void test_opencl_device(AVOpenCLDeviceList *device_list)
> +{
> + int i, j;
> + int platform_idx, device_idx;
nit: fastest_platform_idx, fastest_device_idx;
> + int time = 0;
> + int fastest = INT_MAX;
> + cl_int status;
> + cl_context_properties cps[3];
> + AVOpenCLDeviceNode *device_node = NULL;
> + AVOpenCLExternalEnv *ext_opencl_env = NULL;
> + ext_opencl_env = av_opencl_alloc_external_env();
> +
> + for (i = 0; i < device_list->platform_num; i++) {
> + for (j = 0; j < device_list->platform_node[i]->device_num; j++) {
> + ext_opencl_env->platform_id = device_list->platform_node[i]->platform_id;
> + device_node = device_list->platform_node[i]->device_node[j];
> + ext_opencl_env->device_id = device_node->device_id;
> + ext_opencl_env->device_type = device_node->device_type;
> +
> + av_log(NULL, AV_LOG_INFO, "Performing test on OpenCL platform: %s, device: %s\n",
> + device_list->platform_node[i]->platform_name, device_node->device_name);
> +
> + cps[0] = CL_CONTEXT_PLATFORM;
> + cps[1] = (cl_context_properties)ext_opencl_env->platform_id;
> + cps[2] = 0;
> + ext_opencl_env->context = clCreateContextFromType(cps, ext_opencl_env->device_type,
> + NULL, NULL, &status);
> + if (status != CL_SUCCESS) {
> + if(ext_opencl_env->context)
if_(
> + clReleaseContext(ext_opencl_env->context);
> + continue;
> + }
> + ext_opencl_env->command_queue = clCreateCommandQueue(ext_opencl_env->context,
> + ext_opencl_env->device_id, 0, &status);
> + if (status != CL_SUCCESS) {
> + if(ext_opencl_env->command_queue)
> + clReleaseCommandQueue(ext_opencl_env->command_queue);
> + if(ext_opencl_env->context)
> + clReleaseContext(ext_opencl_env->context);
> + continue;
> + }
> + time = run_benchmark(ext_opencl_env);
> + if (time > 0) {
> + av_log(NULL, AV_LOG_INFO, "Device valid, processing time in microseconds: %d\n", time);
Output should be print possibly in a tabular form on stdout, using a
format easy to parse.
I suggest something like:
(for each valid device):
platform_idx=%d device_idx=% bench=%d
...
probably sorted by benchmark time. Then it should be easy for the user
to extract the best device info and reuse it in a script.
> + if(time < fastest) {
> + platform_idx = i;
> + device_idx = j;
> + fastest = time;
> + }
> + } else {
> + av_log(NULL, AV_LOG_ERROR, "Invalid OpenCL device, test failed\n");
> + }
> + clReleaseCommandQueue(ext_opencl_env->command_queue);
> + clReleaseContext(ext_opencl_env->context);
> + }
> + }
> + av_log(NULL, AV_LOG_WARNING, "Measured fastest OpenCL device: %s, %s\n",
> + device_list->platform_node[platform_idx]->platform_name,
> + device_list->platform_node[platform_idx]->device_node[device_idx]->device_name);
> + av_log(NULL, AV_LOG_WARNING, "use '-opencl_options platform_idx=%d,device_idx=%d' for this device\n",
> + platform_idx, device_idx);
> + av_opencl_free_external_env(&ext_opencl_env);
> +}
> +
> +void show_opencl(void *optctx, const char *opt, const char *arg)
> +{
> + AVOpenCLDeviceList *device_list;
> + av_opencl_get_device_list(&device_list);
> + test_opencl_device(device_list);
> + av_opencl_free_device_list(&device_list);
> +}
You can merge test_opencl_device() here
> +
> int opt_opencl(void *optctx, const char *opt, const char *arg)
> {
> char *key, *value;
> diff --git a/cmdutils.h b/cmdutils.h
> index ce6660a..7d37c6a 100644
> --- a/cmdutils.h
> +++ b/cmdutils.h
> @@ -502,6 +502,12 @@ int show_sample_fmts(void *optctx, const char *opt, const char *arg);
> int show_colors(void *optctx, const char *opt, const char *arg);
>
> /**
> + * Print a listing containing all the opencl platforms and devices with
> + * benchmark scores.
> + */
> +void show_opencl(void *optctx, const char *opt, const char *arg);
> +
> +/**
> * Return a positive value if a line read from standard input
> * starts with [yY], otherwise return 0.
> */
> diff --git a/cmdutils_common_opts.h b/cmdutils_common_opts.h
> index 3e3f0ac..7938b06 100644
> --- a/cmdutils_common_opts.h
> +++ b/cmdutils_common_opts.h
> @@ -22,5 +22,6 @@
> { "max_alloc" , HAS_ARG, {.func_arg = opt_max_alloc}, "set maximum size of a single allocated block", "bytes" },
> { "cpuflags" , HAS_ARG | OPT_EXPERT, { .func_arg = opt_cpuflags }, "force specific cpu flags", "flags" },
> #if CONFIG_OPENCL
> + { "show_opencl", OPT_EXIT, {.func_arg = show_opencl}, "run OpenCL benchmark across available devices" },
opencl_bench?
> { "opencl_options", HAS_ARG, {.func_arg = opt_opencl}, "set OpenCL environment options" },
> #endif
> diff --git a/libavutil/opencl.c b/libavutil/opencl.c
> index ae4c476..8654c25 100644
> --- a/libavutil/opencl.c
> +++ b/libavutil/opencl.c
> @@ -98,7 +98,7 @@ static const AVClass openclutils_class = {
>
> static OpenclContext opencl_ctx = {&openclutils_class};
>
> -static const cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_DEFAULT};
> +static const cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU};
Unrelated?
[...]
I'm fine with the overall design if Michael (cmdutils.c maintainer) is
OK with it. As alternative solution, a dedicated tool may do.
--
FFmpeg = Fantastic and Fierce Multipurpose Purposeless Efficient Gigant
More information about the ffmpeg-devel
mailing list