[FFmpeg-devel] [PATCH]opencl: automatically select the fastest opencl device

Sat Nov 23 19:58:40 CET 2013

On date Friday 2013-11-22 12:53:38 -0600, Lenny Wang encoded:
> On Fri, Nov 22, 2013 at 4:42 AM, Lenny Wang <lenny at multicorewareinc.com> wrote:
[...]
> Patch updated, use "-show_opencl" utility to test all opencl devices
> and the fastest one got selected in the end.

> From 0dce93ffee13d03c72c7a071f9292623d8e81b96 Mon Sep 17 00:00:00 2001
> From: Lenny Wang <lwanghpc at gmail.com>
> Date: Fri, 22 Nov 2013 12:43:02 -0600
> Subject: [PATCH] cmdutils/opencl: show available opencl devices and
>  perform tests
> 
> Signed-off-by: Lenny Wang <lwanghpc at gmail.com>
> ---
>  cmdutils.c             | 242 +++++++++++++++++++++++++++++++++++++++++++++++++
>  cmdutils.h             |   6 ++
>  cmdutils_common_opts.h |   1 +
>  libavutil/opencl.c     |   2 +-
>  4 files changed, 250 insertions(+), 1 deletion(-)

Missing doc/cmdutils.texi updates.

> diff --git a/cmdutils.c b/cmdutils.c
> index cbf660f..d99207b 100644
> --- a/cmdutils.c
> +++ b/cmdutils.c
> @@ -48,6 +48,7 @@
>  #include "libavutil/dict.h"
>  #include "libavutil/opt.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/time.h"
>  #include "cmdutils.h"
>  #include "version.h"
>  #if CONFIG_NETWORK
> @@ -987,6 +988,247 @@ int opt_timelimit(void *optctx, const char *opt, const char *arg)
>  }
>  
>  #if CONFIG_OPENCL
> +const char *ocl_bench_source = AV_OPENCL_KERNEL(
> +inline unsigned char clip_uint8(int a)
> +{
> +    if (a & (~0xFF))
> +        return (-a)>>31;
> +    else
> +        return a;
> +}
> +
> +kernel void unsharp_luma(
> +                    global unsigned char *src,
> +                    global unsigned char *dst,
> +                    global int *mask,
> +                    int scalebits,
> +                    int halfscale,
> +                    int width,
> +                    int height)
> +{
> +    int2 threadIdx, blockIdx, globalIdx;

nit: thread_idx, block_idx, global_idx for style consistency, same below

> +    threadIdx.x = get_local_id(0);
> +    threadIdx.y = get_local_id(1);
> +    blockIdx.x = get_group_id(0);
> +    blockIdx.y = get_group_id(1);
> +    globalIdx.x = get_global_id(0);
> +    globalIdx.y = get_global_id(1);

> +
> +    local uchar l[32][32];
> +    local int lc[LU_RADIUS_X*LU_RADIUS_Y];
> +    int indexIx, indexIy, i, j;
> +

> +    for(i = 0; i <= 1; i++) {

style nit: for_(...)
here and below

> +        indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y;
> +        indexIy = indexIy < 0 ? 0 : indexIy;
> +        indexIy = indexIy >= height ? height - 1: indexIy;
> +        for(j = 0; j <= 1; j++) {
> +            indexIx = -8 + (blockIdx.x + j) * 16 + threadIdx.x;
> +            indexIx = indexIx < 0 ? 0 : indexIx;
> +            indexIx = indexIx >= width ? width - 1: indexIx;
> +            l[i*16 + threadIdx.y][j*16 + threadIdx.x] = src[indexIy*width + indexIx];
> +        }
> +    }
> +
> +    int indexL = threadIdx.y*16 + threadIdx.x;
> +    if (indexL < LU_RADIUS_X*LU_RADIUS_Y)
> +        lc[indexL] = mask[indexL];
> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    int idx, idy, maskIndex;
> +    int sum = 0;
> +    int steps_x = LU_RADIUS_X/2;
> +    int steps_y = LU_RADIUS_Y/2;
> +
> +    \n#pragma unroll\n
> +    for (i = -steps_y; i <= steps_y; i++) {
> +        idy = 8 + i + threadIdx.y;
> +        \n#pragma unroll\n
> +        for (j = -steps_x; j <= steps_x; j++) {
> +            idx = 8 + j + threadIdx.x;
> +            maskIndex = (i + steps_y)*LU_RADIUS_X + j + steps_x;
> +            sum += (int)l[idy][idx] * lc[maskIndex];
> +        }
> +    }
> +    int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8];
> +    int res = temp + (((temp - (int)((sum + halfscale) >> scalebits))) >> 16);
> +    if (globalIdx.x < width && globalIdx.y < height)
> +        dst[globalIdx.x + globalIdx.y*width] = clip_uint8(res);
> +}
> +);
> +

> +static void fill_rand_int(int* data, int n)

nit: int *data

> +{
> +    int i;
> +    srand(av_gettime());
> +    for (i = 0; i < n; i++)
> +        data[i] = rand();
> +}
> +
> +static int run_benchmark(AVOpenCLExternalEnv *ext_opencl_env)
> +{
> +    int i, ret = 0;
> +    char build_opts[64];
> +    int steps = 3;
> +    int width = 1920;
> +    int height = 1088;
> +    int memsize = width*height*sizeof(char);
> +    int masksize = sizeof(uint32_t) * (2 * steps + 1) * (2 * steps + 1);
> +    int scalebits = (steps + steps) * 2;
> +    int halfscale = 1 << (scalebits - 1);
> +    int arg = 0;
> +    int64_t start = 0;
> +    size_t kernel_len;
> +
> +    cl_mem cl_mask;
> +    cl_mem cl_inbuf;
> +    cl_mem cl_outbuf;
> +    cl_kernel kernel = NULL;
> +    cl_program program = NULL;

> +    size_t localWorkSize2d[2] = {16, 16};
> +    size_t globalWorkSize2d[2] = {(size_t)width, (size_t)height};

style

> +
> +    char* inbuf = av_malloc(memsize);
> +    int* mask = av_malloc(masksize);

nit: char *inbuf ...; int *mask ...

probably you should check av_malloc() result

> +    fill_rand_int((int*)inbuf, memsize/4);
> +    fill_rand_int(mask, masksize/4);
> +

> +    cl_mask = clCreateBuffer(ext_opencl_env->context, CL_MEM_READ_ONLY, masksize, NULL, &ret);
> +    cl_inbuf = clCreateBuffer(ext_opencl_env->context, CL_MEM_READ_ONLY, memsize, NULL, &ret);
> +    cl_outbuf = clCreateBuffer(ext_opencl_env->context, CL_MEM_READ_WRITE, memsize, NULL, &ret);
> +    if (ret < 0)
> +        goto end;

if first fail and the last will succeed this will not fail, I don't
know if you can expect that a clCreateBuffer failure is always followed by a failure.

> +
> +    kernel_len = strlen(ocl_bench_source);
> +    program = clCreateProgramWithSource(ext_opencl_env->context, 1, &ocl_bench_source, &kernel_len, &ret);

> +    if (ret != CL_SUCCESS || !program) {
> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create program\n");

... to create benchmark program

> +        goto end;
> +    }

> +    snprintf(build_opts, 64, "-D LU_RADIUS_X=%d -D LU_RADIUS_Y=%d", 2*steps+1, 2*steps+1);

why don't you hardcode these in the kernel source?

> +    ret = clBuildProgram(program, 1, &(ext_opencl_env->device_id), build_opts, NULL, NULL);
> +    if (ret != CL_SUCCESS) {
> +        av_log(NULL, AV_LOG_ERROR, "Unable to build program\n");

... benchmark program

> +        goto end;
> +    }

> +    kernel = clCreateKernel(program, "unsharp_luma", &ret);

nit: "benchmark" or maybe "benchmark_unsharp_luma"

> +    if (ret != CL_SUCCESS) {
> +        goto end;
> +    }
> +    clEnqueueWriteBuffer(ext_opencl_env->command_queue, cl_inbuf, CL_TRUE, 0, memsize, inbuf, 0, NULL, NULL);
> +    clEnqueueWriteBuffer(ext_opencl_env->command_queue, cl_mask, CL_TRUE, 0, masksize, mask, 0, NULL, NULL);
> +
> +    ret = clSetKernelArg(kernel,arg++,sizeof(cl_mem),&cl_inbuf);
> +    ret|= clSetKernelArg(kernel,arg++,sizeof(cl_mem),&cl_outbuf);
> +    ret|= clSetKernelArg(kernel,arg++,sizeof(cl_mem),&cl_mask);
> +    ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&scalebits);
> +    ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&halfscale);
> +    ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&width);
> +    ret|= clSetKernelArg(kernel,arg++,sizeof(cl_int),&height);
> +    if (ret < 0)
> +        goto end;

I wonder if ff_opencl_set_parameter() should be made public.

> +
> +    // warm up
> +    ret = clEnqueueNDRangeKernel(ext_opencl_env->command_queue, kernel, 2, NULL,
> +                    globalWorkSize2d, localWorkSize2d, 0, NULL, NULL);
> +    clFinish(ext_opencl_env->command_queue);
> +    if (ret < 0)
> +        goto end;
> +
> +    start = av_gettime();

> +    for(i = 0; i < 5; i++)

for_(

you could replace literal 5 with a macro, such as OPENCL_NB_BENCHMARKS

> +	    clEnqueueNDRangeKernel(ext_opencl_env->command_queue, kernel, 2, NULL, globalWorkSize2d,
> +                                localWorkSize2d, 0, NULL, NULL);

> +	clFinish(ext_opencl_env->command_queue);
> +    ret = (int)((av_gettime() - start)/5);

> +end:

> +    if(kernel)

if_(...)

> +        clReleaseKernel(kernel);
> +    if(program)
> +        clReleaseProgram(program);
> +    if(cl_inbuf)
> +        clReleaseMemObject(cl_inbuf);
> +    if(cl_outbuf)
> +        clReleaseMemObject(cl_outbuf);
> +    if(cl_mask)
> +        clReleaseMemObject(cl_mask);
> +    av_free(inbuf);
> +    av_free(mask);
> +    return ret;
> +}
> +
> +static void test_opencl_device(AVOpenCLDeviceList *device_list)
> +{
> +    int i, j;

> +    int platform_idx, device_idx;

nit: fastest_platform_idx, fastest_device_idx;

> +    int time = 0;
> +    int fastest = INT_MAX;
> +    cl_int status;
> +    cl_context_properties cps[3];
> +    AVOpenCLDeviceNode *device_node = NULL;
> +    AVOpenCLExternalEnv *ext_opencl_env = NULL;
> +    ext_opencl_env = av_opencl_alloc_external_env();
> +
> +    for (i = 0; i < device_list->platform_num; i++) {
> +        for (j = 0; j < device_list->platform_node[i]->device_num; j++) {
> +            ext_opencl_env->platform_id = device_list->platform_node[i]->platform_id;
> +            device_node = device_list->platform_node[i]->device_node[j];
> +            ext_opencl_env->device_id = device_node->device_id;
> +            ext_opencl_env->device_type = device_node->device_type;
> +
> +            av_log(NULL, AV_LOG_INFO, "Performing test on OpenCL platform: %s, device: %s\n",
> +                   device_list->platform_node[i]->platform_name, device_node->device_name);
> +
> +            cps[0] = CL_CONTEXT_PLATFORM;
> +            cps[1] = (cl_context_properties)ext_opencl_env->platform_id;
> +            cps[2] = 0;
> +            ext_opencl_env->context = clCreateContextFromType(cps, ext_opencl_env->device_type,
> +                                                                NULL, NULL, &status);
> +            if (status != CL_SUCCESS) {
> +                if(ext_opencl_env->context)

if_(

> +                    clReleaseContext(ext_opencl_env->context);
> +                continue;
> +            }
> +            ext_opencl_env->command_queue = clCreateCommandQueue(ext_opencl_env->context,
> +                                                        ext_opencl_env->device_id, 0, &status);
> +            if (status != CL_SUCCESS) {
> +                if(ext_opencl_env->command_queue)
> +                    clReleaseCommandQueue(ext_opencl_env->command_queue);
> +                if(ext_opencl_env->context)
> +                    clReleaseContext(ext_opencl_env->context);
> +                continue;
> +            }

> +            time = run_benchmark(ext_opencl_env);
> +            if (time > 0) {

> +                av_log(NULL, AV_LOG_INFO, "Device valid, processing time in microseconds: %d\n", time);

Output should be print possibly in a tabular form on stdout, using a
format easy to parse.

I suggest something like:

(for each valid device):
platform_idx=%d device_idx=% bench=%d
...

probably sorted by benchmark time. Then it should be easy for the user
to extract the best device info and reuse it in a script.

> +                if(time < fastest) {
> +                    platform_idx = i;
> +                    device_idx = j;
> +                    fastest = time;
> +                }
> +            } else {
> +                av_log(NULL, AV_LOG_ERROR, "Invalid OpenCL device, test failed\n");
> +            }
> +            clReleaseCommandQueue(ext_opencl_env->command_queue);
> +            clReleaseContext(ext_opencl_env->context);
> +        }
> +    }

> +    av_log(NULL, AV_LOG_WARNING, "Measured fastest OpenCL device: %s, %s\n",
> +            device_list->platform_node[platform_idx]->platform_name,
> +            device_list->platform_node[platform_idx]->device_node[device_idx]->device_name);
> +    av_log(NULL, AV_LOG_WARNING, "use '-opencl_options platform_idx=%d,device_idx=%d' for this device\n",
> +            platform_idx, device_idx);
> +    av_opencl_free_external_env(&ext_opencl_env);
> +}
> +

> +void show_opencl(void *optctx, const char *opt, const char *arg)
> +{
> +    AVOpenCLDeviceList *device_list;
> +    av_opencl_get_device_list(&device_list);
> +    test_opencl_device(device_list);
> +    av_opencl_free_device_list(&device_list);
> +}

You can merge test_opencl_device() here

> +
>  int opt_opencl(void *optctx, const char *opt, const char *arg)
>  {
>      char *key, *value;
> diff --git a/cmdutils.h b/cmdutils.h
> index ce6660a..7d37c6a 100644
> --- a/cmdutils.h
> +++ b/cmdutils.h
> @@ -502,6 +502,12 @@ int show_sample_fmts(void *optctx, const char *opt, const char *arg);
>  int show_colors(void *optctx, const char *opt, const char *arg);
>  
>  /**
> + * Print a listing containing all the opencl platforms and devices with
> + * benchmark scores.
> + */
> +void show_opencl(void *optctx, const char *opt, const char *arg);
> +
> +/**
>   * Return a positive value if a line read from standard input
>   * starts with [yY], otherwise return 0.
>   */
> diff --git a/cmdutils_common_opts.h b/cmdutils_common_opts.h
> index 3e3f0ac..7938b06 100644
> --- a/cmdutils_common_opts.h
> +++ b/cmdutils_common_opts.h
> @@ -22,5 +22,6 @@
>      { "max_alloc"  , HAS_ARG,  {.func_arg = opt_max_alloc},     "set maximum size of a single allocated block", "bytes" },
>      { "cpuflags"   , HAS_ARG | OPT_EXPERT, { .func_arg = opt_cpuflags }, "force specific cpu flags", "flags" },
>  #if CONFIG_OPENCL
> +    { "show_opencl", OPT_EXIT, {.func_arg = show_opencl},       "run OpenCL benchmark across available devices" },

opencl_bench?

>      { "opencl_options", HAS_ARG, {.func_arg = opt_opencl},      "set OpenCL environment options" },
>  #endif
> diff --git a/libavutil/opencl.c b/libavutil/opencl.c
> index ae4c476..8654c25 100644
> --- a/libavutil/opencl.c
> +++ b/libavutil/opencl.c
> @@ -98,7 +98,7 @@ static const AVClass openclutils_class = {
>  
>  static OpenclContext opencl_ctx = {&openclutils_class};
>  
> -static const cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_DEFAULT};
> +static const cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU};

Unrelated?

[...]

I'm fine with the overall design if Michael (cmdutils.c maintainer) is
OK with it. As alternative solution, a dedicated tool may do.
-- 
FFmpeg = Fantastic and Fierce Multipurpose Purposeless Efficient Gigant