[FFmpeg-devel] [PATCH 1/2] libavutil/libavfilter: opencl wrapper based on comments on 20130326

Stefano Sabatini stefasab at gmail.com
Wed Mar 27 00:45:33 CET 2013


On date Tuesday 2013-03-26 18:55:05 +0800, Wei Gao encoded:
> 

> From f91df6a8315a1b7bdc7b69517831fc745fcbd4fd Mon Sep 17 00:00:00 2001
> From: highgod0401 <highgod0401 at gmail.com>
> Date: Tue, 26 Mar 2013 18:43:00 +0800
> Subject: [PATCH 1/2] opencl wrapper based on comments on 20130326
> 
> ---
>  configure          |   4 +
>  libavutil/Makefile |   3 +
>  libavutil/opencl.c | 653 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  libavutil/opencl.h | 219 ++++++++++++++++++
>  4 files changed, 879 insertions(+)
>  create mode 100644 libavutil/opencl.c
>  create mode 100644 libavutil/opencl.h
> 
> diff --git a/configure b/configure
> index 8443db4..9c42a85 100755
> --- a/configure
> +++ b/configure
> @@ -233,6 +233,7 @@ External library support:
>    --enable-libxvid         enable Xvid encoding via xvidcore,
>                             native MPEG-4/Xvid encoder exists [no]
>    --enable-openal          enable OpenAL 1.1 capture support [no]
> +  --enable-opencl          enable OpenCL code
>    --enable-openssl         enable openssl [no]
>    --enable-x11grab         enable X11 grabbing [no]
>    --enable-zlib            enable zlib [autodetect]
> @@ -1178,6 +1179,7 @@ EXTERNAL_LIBRARY_LIST="
>      libxavs
>      libxvid
>      openal
> +    opencl
>      openssl
>      x11grab
>      zlib
> @@ -3982,6 +3984,7 @@ enabled openal     && { { for al_libs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32
>                          die "ERROR: openal not found"; } &&
>                        { check_cpp_condition "AL/al.h" "defined(AL_VERSION_1_1)" ||
>                          die "ERROR: openal must be installed and version must be 1.1 or compatible"; }
> +enabled opencl     && require2 opencl CL/cl.h clEnqueueNDRangeKernel -lOpenCL
>  enabled openssl    && { check_lib openssl/ssl.h SSL_library_init -lssl -lcrypto ||
>                          check_lib openssl/ssl.h SSL_library_init -lssl32 -leay32 ||
>                          check_lib openssl/ssl.h SSL_library_init -lssl -lcrypto -lws2_32 -lgdi32 ||
> @@ -4350,6 +4353,7 @@ echo "network support           ${network-no}"
>  echo "threading support         ${thread_type-no}"
>  echo "safe bitstream reader     ${safe_bitstream_reader-no}"
>  echo "SDL support               ${sdl-no}"
> +echo "opencl enabled            ${opencl-no}"
>  echo "texi2html enabled         ${texi2html-no}"
>  echo "perl enabled              ${perl-no}"
>  echo "pod2man enabled           ${pod2man-no}"
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 103ce5e..6375e10 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -52,6 +52,8 @@ HEADERS = adler32.h                                                     \
>  
>  HEADERS-$(CONFIG_LZO)                   += lzo.h
>  
> +HEADERS-$(CONFIG_OPENCL)                += opencl.h
> +
>  ARCH_HEADERS = bswap.h                                                  \
>                 intmath.h                                                \
>                 intreadwrite.h                                           \
> @@ -115,6 +117,7 @@ SKIPHEADERS-$(HAVE_MACHINE_RW_BARRIER)          += atomic_suncc.h
>  SKIPHEADERS-$(HAVE_MEMORYBARRIER)               += atomic_win32.h
>  SKIPHEADERS-$(HAVE_SYNC_VAL_COMPARE_AND_SWAP)   += atomic_gcc.h
>  
> +OBJS-$(CONFIG_OPENCL)                   += opencl.o
>  TESTPROGS = adler32                                                     \
>              aes                                                         \
>              atomic                                                      \
> diff --git a/libavutil/opencl.c b/libavutil/opencl.c
> new file mode 100644
> index 0000000..929aae7
> --- /dev/null
> +++ b/libavutil/opencl.c
> @@ -0,0 +1,653 @@
> +/*
> + * Copyright (C) 2012 Peng Gao <peng at multicorewareinc.com>
> + * Copyright (C) 2012 Li   Cao <li at multicorewareinc.com>
> + * Copyright (C) 2012 Wei  Gao <weigao at multicorewareinc.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "opencl.h"
> +#include "avstring.h"
> +#include "log.h"
> +#include "avassert.h"
> +
> +#define MAX_KERNEL_NAME_LEN  64
> +#define MAX_KERNEL_NUM 200
> +
> +typedef struct GPUEnv {
> +    cl_platform_id platform;
> +    cl_device_type device_type;
> +    cl_context context;
> +    cl_device_id *device_ids;
> +    cl_device_id device_id;
> +    cl_command_queue command_queue;
> +    cl_program program;

> +    char kernel_names[MAX_KERNEL_NUM][MAX_KERNEL_NAME_LEN+1];
> +    av_opencl_kernel_function kernel_functions[MAX_KERNEL_NUM];
> +    const char *kernel_code[MAX_KERNEL_NUM];

Are these related to kernel *functions*, right?

In this case I think:
   char                      kernel_code_name     [MAX_KERNEL_FUNCTIONS_NUM][MAX_KERNEL_FUNCTION_NAME_LEN+1];
   const char               *kernel_code          [MAX_KERNEL_FUNCTIONS_NUM];
   av_opencl_kernel_function kernel_functions     [MAX_KERNEL_FUNCTIONS_NUM];

may be more clear.

> +    int kernel_count;

This also seems related to kernel function names, so I guess:

int kernel_function_count;

could be better

> +    int runtime_kernel_count;
> +    int is_user_created; // 1: the opencl env is created by user and use AVOpenCLExternalInfo to pass to ffmpeg ,0:created by opencl wrapper
> +    uint8_t *temp_buffer;
> +    int temp_buffer_size;
> +} GPUEnv;
> +
> +typedef struct OpenclErrorMsg {
> +    int err_code;
> +    const char *err_str;
> +} OpenclErrorMsg;
> +
> +static OpenclErrorMsg opencl_err_msg[] = {
> +        {CL_DEVICE_NOT_FOUND,                               "DEVICE NOT FOUND"},
> +        {CL_DEVICE_NOT_AVAILABLE,                           "DEVICE NOT AVAILABLE"},
> +        {CL_COMPILER_NOT_AVAILABLE,                         "COMPILER NOT AVAILABLE"},
> +        {CL_MEM_OBJECT_ALLOCATION_FAILURE,                  "MEM OBJECT ALLOCATION FAILURE"},
> +        {CL_OUT_OF_RESOURCES,                               "OUT OF RESOURCES"},
> +        {CL_OUT_OF_HOST_MEMORY,                             "OUT OF HOST MEMORY"},
> +        {CL_PROFILING_INFO_NOT_AVAILABLE,                   "PROFILING INFO NOT AVAILABLE"},
> +        {CL_MEM_COPY_OVERLAP,                               "MEM COPY OVERLAP"},
> +        {CL_IMAGE_FORMAT_MISMATCH,                          "IMAGE FORMAT MISMATCH"},
> +        {CL_IMAGE_FORMAT_NOT_SUPPORTED,                     "IMAGE FORMAT NOT_SUPPORTED"},
> +        {CL_BUILD_PROGRAM_FAILURE,                          "BUILD PROGRAM FAILURE"},
> +        {CL_MAP_FAILURE,                                    "MAP FAILURE"},
> +        {CL_MISALIGNED_SUB_BUFFER_OFFSET,                   "MISALIGNED SUB BUFFER OFFSET"},
> +        {CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST,      "EXEC STATUS ERROR FOR EVENTS IN WAIT LIST"},
> +        {CL_COMPILE_PROGRAM_FAILURE,                        "COMPILE PROGRAM FAILURE"},
> +        {CL_LINKER_NOT_AVAILABLE,                           "LINKER NOT AVAILABLE"},
> +        {CL_LINK_PROGRAM_FAILURE,                           "LINK PROGRAM FAILURE"},
> +        {CL_DEVICE_PARTITION_FAILED,                        "DEVICE PARTITION FAILED"},
> +        {CL_KERNEL_ARG_INFO_NOT_AVAILABLE,                  "KERNEL ARG INFO NOT AVAILABLE"},
> +        {CL_INVALID_VALUE,                                  "INVALID VALUE"},
> +        {CL_INVALID_DEVICE_TYPE,                            "INVALID DEVICE TYPE"},
> +        {CL_INVALID_PLATFORM,                               "INVALID PLATFORM"},
> +        {CL_INVALID_DEVICE,                                 "INVALID DEVICE"},
> +        {CL_INVALID_CONTEXT,                                "INVALID CONTEXT"},
> +        {CL_INVALID_QUEUE_PROPERTIES,                       "INVALID QUEUE PROPERTIES"},
> +        {CL_INVALID_COMMAND_QUEUE,                          "INVALID COMMAND QUEUE"},
> +        {CL_INVALID_HOST_PTR,                               "INVALID HOST PTR"},
> +        {CL_INVALID_MEM_OBJECT,                             "INVALID MEM OBJECT"},
> +        {CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,                "INVALID IMAGE FORMAT DESCRIPTOR"},
> +        {CL_INVALID_IMAGE_SIZE,                             "INVALID IMAGE SIZE"},
> +        {CL_INVALID_SAMPLER,                                "INVALID SAMPLER"},
> +        {CL_INVALID_BINARY,                                 "INVALID BINARY"},
> +        {CL_INVALID_BUILD_OPTIONS,                          "INVALID BUILD OPTIONS"},
> +        {CL_INVALID_PROGRAM,                                "INVALID PROGRAM"},
> +        {CL_INVALID_PROGRAM_EXECUTABLE,                     "INVALID PROGRAM EXECUTABLE"},
> +        {CL_INVALID_KERNEL_NAME,                            "INVALID KERNEL NAME"},
> +        {CL_INVALID_KERNEL_DEFINITION,                      "INVALID KERNEL DEFINITION"},
> +        {CL_INVALID_KERNEL,                                 "INVALID KERNEL"},
> +        {CL_INVALID_ARG_INDEX,                              "INVALID ARG INDEX"},
> +        {CL_INVALID_ARG_VALUE,                              "INVALID ARG VALUE"},
> +        {CL_INVALID_ARG_SIZE,                               "INVALID ARG_SIZE"},
> +        {CL_INVALID_KERNEL_ARGS,                            "INVALID KERNEL ARGS"},
> +        {CL_INVALID_WORK_DIMENSION,                         "INVALID WORK DIMENSION"},
> +        {CL_INVALID_WORK_GROUP_SIZE,                        "INVALID WORK GROUP SIZE"},
> +        {CL_INVALID_WORK_ITEM_SIZE,                         "INVALID WORK ITEM SIZE"},
> +        {CL_INVALID_GLOBAL_OFFSET,                          "INVALID GLOBAL OFFSET"},
> +        {CL_INVALID_EVENT_WAIT_LIST,                        "INVALID EVENT WAIT LIST"},
> +        {CL_INVALID_EVENT,                                  "INVALID EVENT"},
> +        {CL_INVALID_OPERATION,                              "INVALID OPERATION"},
> +        {CL_INVALID_GL_OBJECT,                              "INVALID GL OBJECT"},
> +        {CL_INVALID_BUFFER_SIZE,                            "INVALID BUFFER SIZE"},
> +        {CL_INVALID_MIP_LEVEL,                              "INVALID MIP LEVEL"},
> +        {CL_INVALID_GLOBAL_WORK_SIZE,                       "INVALID GLOBAL WORK SIZE"},
> +        {CL_INVALID_PROPERTY,                               "INVALID PROPERTY"},
> +        {CL_INVALID_IMAGE_DESCRIPTOR,                       "INVALID IMAGE DESCRIPTOR"},
> +        {CL_INVALID_COMPILER_OPTIONS,                       "INVALID COMPILER OPTIONS"},
> +        {CL_INVALID_LINKER_OPTIONS,                         "INVALID LINKER OPTIONS"},
> +        {CL_INVALID_DEVICE_PARTITION_COUNT,                 "INVALID DEVICE PARTITION COUNT"},
> +};
> +
> +typedef struct OpenclUtils {
> +    const AVClass *class;
> +    int log_offset;
> +    void *log_ctx;
> +} OpenclUtils;
> +
> +static const AVClass openclutils_class = {"OPENCLUTILS", av_default_item_name,
> +                                                   NULL, LIBAVUTIL_VERSION_INT,
> +                                                   offsetof(OpenclUtils, log_offset),
> +                                                   offsetof(OpenclUtils, log_ctx)};
> +static OpenclUtils openclutils = {&openclutils_class};
> +static GPUEnv gpu_env;

> +static int isinited;

nit:
is_inited or opencl_is_inited

> +
> +int av_opencl_register_kernel(const char *kernel_name, const char *kernel_code)
> +{
> +    if (gpu_env.kernel_count < MAX_KERNEL_NUM) {
> +        if (strlen(kernel_name) <= MAX_KERNEL_NAME_LEN) {
> +            gpu_env.kernel_code[gpu_env.kernel_count] = kernel_code;
> +            av_strlcpy(gpu_env.kernel_names[gpu_env.kernel_count], kernel_name, MAX_KERNEL_NAME_LEN+1);
> +            gpu_env.kernel_count++;
> +        } else {
> +            av_log(&openclutils, AV_LOG_ERROR, "Registered kernel name %s is too long\n", kernel_name);
> +            return AVERROR(EINVAL);
> +        }
> +    } else {
> +        av_log(&openclutils, AV_LOG_ERROR,
> +         "Could not register kernel with name '%s', maximum number of registered kernels %d already reached\n",
> +         kernel_name, MAX_KERNEL_NUM);
> +        return AVERROR(EINVAL);
> +    }

Nit: you could simplify the structure and save one or two indent levels:

if (gpu_env.kernel_count >= MAX_KERNEL_NUM) {
   ...
   return err;
}   

if (strlen(...) >= ...)
   ...
   return err;
}

register kernel;
}

Uhm so basically this function register a kernel name and code in the
global environment, and this will be compiled when doing
av_opencl_init(), right?

More about the overall design later.

> +    return 0;
> +}
> +static const char *opencl_errstr(int status)
> +{
> +    int i;
> +    for (i = 0; i < sizeof(opencl_err_msg); i++) {
> +        if (opencl_err_msg[i].err_code == status)
> +            return opencl_err_msg[i].err_str;
> +    }
> +    return "unknown error";
> +}
> +

> +int av_opencl_create_kernel(const char *kernel_name, AVOpenCLKernelEnv *env)
> +{
> +    int status;
> +    if (!env->kernel) {
> +        env->kernel        = clCreateKernel(gpu_env.program, kernel_name, &status);
> +        env->context       = gpu_env.context;
> +        env->command_queue = gpu_env.command_queue;
> +        env->program       = gpu_env.program;
> +        av_strlcpy(env->kernel_name, kernel_name, AV_OPENCL_MAX_KERNEL_NAME_SIZE);

You could abort here in case the kernel name is too long.

> +        if (status != CL_SUCCESS) {
> +            av_log(&openclutils, AV_LOG_ERROR, "Could not create OpenCL kernel: %s\n", opencl_errstr(status));
> +            return AVERROR_EXTERNAL;
> +        }
> +    }
> +    return 0;
> +}

You should abort in case the kernel was already created.

Also maybe:
int av_opencl_create_kernel(AVOpenCLKernelEnv *env, const char *kernel_name);

could make more sense.

Also about the param "kernel_name", this is rather the name of the
function defined in a kernel if I understand correctly, so maybe:
"kernel_function_name" or "kernel_entry_point" could be less
confusing.

> +

> +void av_opencl_release_kernel(AVOpenCLKernelEnv *env)
> +{
> +    int status;
> +    if (!env->kernel)
> +        return;
> +    status = clReleaseKernel(env->kernel);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not release kernel: %s\n", opencl_errstr(status));
> +    }
> +}
> +

> +static int init_opencl_env(GPUEnv *gpu_env, AVOpenCLExternalInfo *ext_opencl_info)
> +{
> +    size_t device_length;
> +    cl_int status;
> +    cl_uint num_platforms, num_devices;
> +    cl_platform_id *platform_ids = NULL;
> +    cl_context_properties cps[3];
> +    char platform_name[100];
> +    int i;
> +    int ret = 0;
> +    cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_DEFAULT};
> +    if (ext_opencl_info) {
> +        if (gpu_env->is_user_created)
> +            return 0;
> +        gpu_env->platform        = ext_opencl_info->platform;
> +        gpu_env->is_user_created = 1;
> +        gpu_env->command_queue   = ext_opencl_info->command_queue;
> +        gpu_env->context         = ext_opencl_info->context;
> +        gpu_env->device_ids      = ext_opencl_info->device_ids;
> +        gpu_env->device_id       = ext_opencl_info->device_id;
> +        gpu_env->device_type     = ext_opencl_info->device_type;
> +    } else {
> +        if (!gpu_env->is_user_created) {
> +            status = clGetPlatformIDs(0, NULL, &num_platforms);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL platform ids: %s\n", opencl_errstr(status));
> +                return AVERROR_EXTERNAL;
> +            }
> +            if (num_platforms > 0) {
> +                platform_ids = av_mallocz(num_platforms * sizeof(cl_platform_id));
> +                if (!platform_ids) {
> +                    ret = AVERROR(ENOMEM);
> +                    goto end;
> +                }
> +                status = clGetPlatformIDs(num_platforms, platform_ids, NULL);
> +                if (status != CL_SUCCESS) {
> +                    av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL platform ids: %s\n", opencl_errstr(status));
> +                    ret = AVERROR_EXTERNAL;
> +                    goto end;
> +                }
> +                for (i = 0; i < num_platforms; i++) {
> +                    status = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR,
> +                                               sizeof(platform_name), platform_name,
> +                                               NULL);
> +
> +                    if (status != CL_SUCCESS) {
> +                        av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL platform info: %s\n", opencl_errstr(status));
> +                        ret = AVERROR_EXTERNAL;
> +                        goto end;
> +                    }
> +                    gpu_env->platform = platform_ids[i];
> +                    status = clGetDeviceIDs(gpu_env->platform, CL_DEVICE_TYPE_GPU,
> +                                            0, NULL, &num_devices);
> +                    if (status != CL_SUCCESS) {
> +                        av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL device number:%s\n", opencl_errstr(status));
> +                        ret = AVERROR_EXTERNAL;
> +                        goto end;
> +                    }
> +                    if (num_devices == 0) {
> +                        //find CPU device
> +                        status = clGetDeviceIDs(gpu_env->platform, CL_DEVICE_TYPE_CPU,
> +                                             0, NULL, &num_devices);
> +                    }
> +                    if (status != CL_SUCCESS) {
> +                        av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL device ids: %s\n", opencl_errstr(status));
> +                        ret = AVERROR_EXTERNAL;
> +                        goto end;
> +                    }
> +                    if (num_devices)
> +                       break;
> +
> +                }
> +            }
> +            if (!gpu_env->platform) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL platforms\n");
> +                ret = AVERROR_EXTERNAL;
> +                goto end;
> +            }
> +
> +           /*
> +                 * Use available platform.
> +                 */
> +            av_log(&openclutils, AV_LOG_VERBOSE, "Platform Name: %s\n", platform_name);
> +            cps[0] = CL_CONTEXT_PLATFORM;
> +            cps[1] = (cl_context_properties)gpu_env->platform;
> +            cps[2] = 0;
> +            /* Check for GPU. */
> +
> +            for (i = 0; i < sizeof(device_type); i++) {
> +                gpu_env->device_type = device_type[i];
> +                gpu_env->context     = clCreateContextFromType(cps, gpu_env->device_type,
> +                                                               NULL, NULL, &status);
> +                if (status != CL_SUCCESS) {
> +                    av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL context from device type: %s\n", opencl_errstr(status));
> +                    ret = AVERROR_EXTERNAL;
> +                    goto end;
> +                }
> +                if (gpu_env->context)
> +                    break;
> +            }
> +            if (!gpu_env->context) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL context from device type\n");
> +                ret = AVERROR_EXTERNAL;
> +                goto end;
> +            }
> +            /* Detect OpenCL devices. */
> +            /* First, get the size of device list data */
> +            status = clGetContextInfo(gpu_env->context, CL_CONTEXT_DEVICES,
> +                                      0, NULL, &device_length);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL device length: %s\n", opencl_errstr(status));
> +                ret = AVERROR_EXTERNAL;
> +                goto end;
> +            }
> +            if (device_length == 0) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL device length\n");
> +                ret = AVERROR_EXTERNAL;
> +                goto end;
> +            }
> +            /* Now allocate memory for device list based on the size we got earlier */
> +            gpu_env->device_ids = av_mallocz(device_length);
> +            if (!gpu_env->device_ids) {
> +                ret = AVERROR(ENOMEM);
> +                goto end;
> +            }
> +            /* Now, get the device list data */
> +            status = clGetContextInfo(gpu_env->context, CL_CONTEXT_DEVICES, device_length,
> +                                      gpu_env->device_ids, NULL);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not get OpenCL context info: %s\n", opencl_errstr(status));
> +                ret = AVERROR_EXTERNAL;
> +                goto end;
> +            }
> +            /* Create OpenCL command queue. */
> +            gpu_env->command_queue = clCreateCommandQueue(gpu_env->context, gpu_env->device_ids[0],
> +                                                          0, &status);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not create OpenCL command queue: %s\n", opencl_errstr(status));
> +                ret = AVERROR_EXTERNAL;
> +                goto end;
> +            }
> +        }
> +    }
> +end:
> +    av_free(platform_ids);
> +    return ret;
> +}
> +
> +
> +int av_opencl_register_kernel_function(const char *kernel_name, av_opencl_kernel_function function)
> +{
> +    int i;
> +    for (i = 0; i < gpu_env.kernel_count; i++) {
> +        if (av_strcasecmp(kernel_name, gpu_env.kernel_names[i]) == 0) {
> +            gpu_env.kernel_functions[i] = function;
> +            gpu_env.runtime_kernel_count++;
> +            return 0;
> +        }
> +    }
> +    av_log(&openclutils, AV_LOG_ERROR, "Could not find a kernel with name '%s', cannot register function\n", kernel_name);
> +    return AVERROR(EINVAL);
> +
> +}
> +
> +static int compile_kernel_file(GPUEnv *gpu_env, const char *build_option)
> +{
> +    cl_int status;

cl_int / int status is used inconsistently in the code, always use
cl_int or int for status.

> +    size_t kernel_code_length = 0;
> +    char *source_str = NULL;

nit: for consistency/readability I suggest
char  *source_str
size_t source_str_len;

> +    char *temp;

> +    int ret = 0;
> +    int i;

nit: you can merge the declarations
int i, ret;

> +
> +    if (gpu_env->program)
> +        return ret;
> +
> +    for (i = 0; i < gpu_env->kernel_count; i++) {
> +        kernel_code_length += strlen(gpu_env->kernel_code[i]);
> +    }
> +    source_str = av_mallocz(kernel_code_length + 1);
> +    if (!source_str) {
> +        return AVERROR(ENOMEM);
> +    }

> +    temp = source_str;
> +    for (i = 0; i < gpu_env->kernel_count; i++) {
> +        memcpy(temp, gpu_env->kernel_code[i], strlen(gpu_env->kernel_code[i]));
> +        temp += strlen(gpu_env->kernel_code[i]);
> +    }
> +    /* create a CL program using the kernel source */
> +    gpu_env->program = clCreateProgramWithSource(gpu_env->context, 1, (const char **)(&source_str),
> +                                                 &kernel_code_length, &status);
> +    if(status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not create OpenCL program with source code: %s\n",
> +               opencl_errstr(status));
> +        ret = AVERROR_EXTERNAL;
> +        goto end;
> +    }
> +    if (!gpu_env->program) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Created program is NULL\n");
> +        ret = AVERROR_EXTERNAL;
> +        goto end;
> +    }
> +    /* create a cl program executable for all the devices specified */
> +    if (!gpu_env->is_user_created)
> +        status = clBuildProgram(gpu_env->program, 1, gpu_env->device_ids,
> +                                build_option, NULL, NULL);
> +    else
> +        status = clBuildProgram(gpu_env->program, 1, &(gpu_env->device_id),
> +                                 build_option, NULL, NULL);
> +
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not compile OpenCL kernel: %s\n", opencl_errstr(status));
> +        ret = AVERROR_EXTERNAL;
> +        goto end;
> +    }
> +end:
> +    av_free(source_str);
> +    return ret;
> +}
> +
> +int av_opencl_run_kernel(const char *kernel_name, void **userdata)
> +{
> +    av_opencl_kernel_function function = NULL;
> +    int i;

> +    for (i = 0; i < gpu_env.kernel_count; i++) {
> +        if (av_strcasecmp(kernel_name, gpu_env.kernel_names[i]) == 0) {
> +            function = gpu_env.kernel_functions[i];
> +            break;
> +        }
> +    }

this could make use of a binary search (see libavutil/tree.c) for
better access times (log_2 versus linear). Not blocking since it can
be changed later with no interface modification.

> +    if (!function) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not find kernel: %s\n", kernel_name);
> +        return AVERROR(EINVAL);
> +    }
> +    return(function(userdata));
> +}
> +

> +int av_opencl_init(const char *build_option, AVOpenCLExternalInfo *ext_opencl_info)
> +{
> +    int ret;

> +    if (!isinited) {
> +        /*initialize devices, context, command_queue*/
> +        ret = init_opencl_env(&gpu_env, ext_opencl_info);
> +        if (ret) {
> +            return ret;
> +        }
> +        /*initialize program, kernel_name, kernel_count*/
> +        ret = compile_kernel_file(&gpu_env, build_option);
> +        if (ret) {
> +            return ret;
> +        }
> +        av_assert1(gpu_env.kernel_count > 0);
> +        isinited = 1;
> +    }
> +    return 0;
> +}


> +
> +void av_opencl_uninit(void)
> +{
> +    int status;
> +    if (!isinited)
> +        return;
> +    av_freep(&(gpu_env.temp_buffer));
> +    if (gpu_env.is_user_created)
> +        return;
> +    gpu_env.runtime_kernel_count--;
> +    if (!gpu_env.runtime_kernel_count) {
> +        if (gpu_env.program) {
> +            status = clReleaseProgram(gpu_env.program);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not release OpenCL program: %s\n", opencl_errstr(status));
> +            }
> +            gpu_env.program = NULL;
> +        }
> +        if (gpu_env.command_queue) {
> +            status = clReleaseCommandQueue(gpu_env.command_queue);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not release OpenCL command queue: %s\n", opencl_errstr(status));
> +            }
> +            gpu_env.command_queue = NULL;
> +        }
> +        if (gpu_env.context) {
> +            status = clReleaseContext(gpu_env.context);
> +            if (status != CL_SUCCESS) {
> +                av_log(&openclutils, AV_LOG_ERROR, "Could not release OpenCL context: %s\n", opencl_errstr(status));
> +            }
> +            gpu_env.context = NULL;
> +        }
> +        av_freep(&(gpu_env.device_ids));
> +        isinited = 0;
> +    }
> +}
> +
> +int av_opencl_is_inited(void)
> +{
> +    return isinited;
> +}
> +
> +void av_opencl_get_kernel_env(AVOpenCLKernelEnv *env)
> +{
> +    env->context       = gpu_env.context;
> +    env->command_queue = gpu_env.command_queue;
> +    env->program       = gpu_env.program;
> +}
> +
> +int av_opencl_buffer_create(void **cl_buf, size_t cl_buf_size, int flags, void *host_ptr)
> +{
> +    int status;
> +    *cl_buf = clCreateBuffer(gpu_env.context, flags, cl_buf_size, host_ptr, &status);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not create OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    return 0;
> +}
> +
> +void av_opencl_buffer_release(void *cl_buf)
> +{
> +    int status = 0;
> +    if (!cl_buf)
> +        return;
> +    status = clReleaseMemObject(cl_buf);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not release OpenCL buffer: %s\n", opencl_errstr(status));
> +    }
> +}
> +
> +int av_opencl_buffer_write(void *dst_cl_buf, uint8_t *src_buf, size_t buf_size)
> +{
> +    int status;
> +    void *mapped = clEnqueueMapBuffer(gpu_env.command_queue, dst_cl_buf,
> +                                      CL_TRUE,CL_MAP_WRITE, 0, sizeof(uint8_t) * buf_size,
> +                                      0, NULL, NULL, &status);
> +
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not map OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    memcpy(mapped, src_buf, buf_size);
> +
> +    status = clEnqueueUnmapMemObject(gpu_env.command_queue, dst_cl_buf, mapped, 0, NULL, NULL);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not unmap OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    return 0;
> +}
> +
> +int av_opencl_buffer_read(uint8_t *dst_buf, void *src_cl_buf, size_t buf_size)
> +{
> +    int status;
> +    void *mapped = clEnqueueMapBuffer(gpu_env.command_queue, src_cl_buf,
> +                                      CL_TRUE,CL_MAP_READ, 0, sizeof(uint8_t) * buf_size,
> +                                      0, NULL, NULL, &status);
> +
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not map OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    memcpy(dst_buf, mapped, buf_size);
> +
> +    status = clEnqueueUnmapMemObject(gpu_env.command_queue, src_cl_buf, mapped, 0, NULL, NULL);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not unmap OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    return 0;
> +}
> +
> +int av_opencl_buffer_write_image(void *dst_cl_buf, size_t cl_buffer_size,
> +                                        uint8_t **src_data, int *plane_size, int plane_num, int offset)
> +{
> +    int buffer_size = 0;
> +    uint8_t *temp;
> +    int status;
> +    void *mapped;
> +    int i;
> +    if ((unsigned int)plane_num > 8) {
> +        return AVERROR(EINVAL);
> +    }
> +    for (i = 0;i < plane_num;i++) {
> +        buffer_size += plane_size[i];
> +    }
> +    if (buffer_size > cl_buffer_size) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Cannot write image to OpenCL buffer: buffer too small\n");
> +        return AVERROR(EINVAL);
> +    }
> +    mapped = clEnqueueMapBuffer(gpu_env.command_queue, dst_cl_buf,
> +                                      CL_TRUE,CL_MAP_WRITE, 0, buffer_size + offset,
> +                                      0, NULL, NULL, &status);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not map OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    temp = mapped;
> +    temp += offset;
> +    for (i = 0; i < plane_num; i++) {
> +        memcpy(temp, src_data[i], plane_size[i]);
> +        temp += plane_size[i];
> +    }
> +    status = clEnqueueUnmapMemObject(gpu_env.command_queue, dst_cl_buf, mapped, 0, NULL, NULL);
> +    if (status != CL_SUCCESS) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Could not unmap OpenCL buffer: %s\n", opencl_errstr(status));
> +        return AVERROR_EXTERNAL;
> +    }
> +    return 0;
> +}
> +
> +int av_opencl_buffer_read_image(uint8_t **dst_data, int *plane_size, int plane_num,
> +                                       void *src_cl_inbuf, size_t cl_buffer_size)
> +{
> +    int buffer_size = 0;
> +    int ret = 0;
> +    uint8_t *temp;
> +    int i;
> +    if ((unsigned int)plane_num > 8) {
> +        return AVERROR(EINVAL);
> +    }
> +    for (i = 0;i < plane_num;i++) {
> +        buffer_size += plane_size[i];
> +    }
> +    if (buffer_size > cl_buffer_size) {
> +        av_log(&openclutils, AV_LOG_ERROR, "Cannot write image to CPU buffer: OpenCL buffer too small\n");
> +        return AVERROR_EXTERNAL;
> +    }
> +    if (!gpu_env.temp_buffer) {
> +        gpu_env.temp_buffer = av_malloc(buffer_size);
> +        if (!gpu_env.temp_buffer)
> +            return AVERROR(ENOMEM);
> +        gpu_env.temp_buffer_size = buffer_size;
> +    }
> +
> +    if(buffer_size > gpu_env.temp_buffer_size) {
> +        av_free(gpu_env.temp_buffer);
> +        gpu_env.temp_buffer = av_malloc(buffer_size);
> +        if (!gpu_env.temp_buffer)
> +            return AVERROR(ENOMEM);
> +        gpu_env.temp_buffer_size = buffer_size;
> +    }
> +    temp = gpu_env.temp_buffer;
> +    ret  = av_opencl_buffer_read(gpu_env.temp_buffer, src_cl_inbuf, buffer_size);

> +    if (!ret) {

if (ret < 0)

> +        for (i = 0;i < plane_num;i++) {
> +            memcpy(dst_data[i], temp, plane_size[i]);
> +            temp += plane_size[i];
> +        }
> +    }
> +    return ret;
> +}
> +
> +cl_device_id av_opencl_get_device_id(void)
> +{
> +    if (!gpu_env.is_user_created) {
> +        return *(gpu_env.device_ids);
> +    } else
> +        return gpu_env.device_id;
> +}
> +
> +cl_context av_opencl_get_context(void)
> +{
> +    return gpu_env.context;
> +}
> +
> +cl_command_queue av_opencl_get_command_queue(void)
> +{
> +    return gpu_env.command_queue;
> +}
> +
> diff --git a/libavutil/opencl.h b/libavutil/opencl.h
> new file mode 100644
> index 0000000..f5172dc
> --- /dev/null
> +++ b/libavutil/opencl.h
> @@ -0,0 +1,219 @@
> +/*
> + * Copyright (C) 2012 Peng Gao <peng at multicorewareinc.com>
> + * Copyright (C) 2012 Li   Cao <li at multicorewareinc.com>
> + * Copyright (C) 2012 Wei  Gao <weigao at multicorewareinc.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#ifndef LIBAVUTIL_OPENCLWRAPPER_H
> +#define LIBAVUTIL_OPENCLWRAPPER_H
> +
> +#include <CL/cl.h>

This is my understanding of how this OpenCL API works.

You register some code with a name (which are currently stored
in the global environment), with av_opencl_register_kernel().

Then the previously registered framments of code are compiled by
OpenCL, when doing: av_opencl_init() -> compile_kernel_file()

With av_opencl_init() you also specify some parameters (build_options)
which are used when compiling the code of the specified functions.

av_opencl_init() also creates the OpenCL program, which is stored in
the global environment. The program is unique for all the kernels
registered so far.

At this point you need to create an entry point for each kernel, to
run a specific *function* defined within it. This is done by creating
a kernel, with av_opencl_create_kernel()

av_opencl_create_kernel() is used to create a kernel (a sort of
handler to communicate with the *compiled* kernel). The kernel is
created specifying the name of the function to run *in the kernel
code*. The kernel is set in the passed AVOpenCLEnv environment.

In order to run a function specified in a kernel, you also need to
provide some parameters/data to it.

This is done through av_opencl_register_kernel_function(), which is
used to register a function which is associated to one of the
previously registered kernel in the global environment.

so we have: kernel(global env) -> function(global env)

To run the code of a kernel, av_opencl_run_kernel() must be called,
with the name of the registered kernel on which the function is to be
called.

This function lookups the functions registered in the global
environment, and executes the registered function with provided user
data/parameters, which in particular must contain the opencl
environment. The environment should contain the kernel handler created
with av_opencl_create_kerne(), and is used to set the arguments for
the function defined in the kernel code, and eventually run the code
for it (see the deshake patch for an example of such usage).

...

So basically this is the workflow:

kernel code registration (done in the global environment)            -> av_opencl_register_kernel()
kernel code compilation/init (always done in the global environment) -> av_opencl_init()

kernel function registration (can be eventually done *before* init)  -> av_opencl_register_kernel_function()
kernel object creation, which is required to run the code            -> av_opencl_create_kernel
kernel code execution with user data parameters                      -> av_opencl_run_kernel()

Cleanup:
kernel object (stored in an environment)                             -> av_opencl_release_kernel()
global environment                                                   -> av_opencl_uninit()

...

Can you confirm that this is an accurate description of the
design/workflow?

The main problem with this design is that different threads and
components can messup with the global environment.

For example you may want to init a filter, this creates a global
environment, then you create another filter/component which requires
to build a different kernel etc., which can't be done since you're
supposed to init the global environment just once.

Ideally we should have one OpenCL context per component, so we don't
need to know everything (kernel code and functions) when we init the
OpenCL system, and by using a global environment you are prevented
from doing that.

In a similar way, when you uninit the OpenCL system you don't know if
other components are actually using it, so the only safe way is to
uninit() it when you close the *application*, which is not ideal for a
library.

> +
> +#define AV_OPENCL_KERNEL( ... )# __VA_ARGS__
> +
> +#define AV_OPENCL_MAX_KERNEL_NAME_SIZE 150
> +
> +typedef struct AVOpenCLKernelEnv {
> +    cl_context context;
> +    cl_command_queue command_queue;
> +    cl_program program;
> +    cl_kernel kernel;
> +    char kernel_name[AV_OPENCL_MAX_KERNEL_NAME_SIZE];
> +} AVOpenCLKernelEnv;
> +
> +typedef struct AVOpenCLExternalInfo {
> +    cl_platform_id platform;
> +    cl_device_type device_type;
> +    cl_context context;
> +    cl_device_id *device_ids;
> +    cl_device_id  device_id;
> +    cl_command_queue command_queue;
> +    char *platform_name;
> +} AVOpenCLExternalInfo;
> +
> +/**
> + * User defined function, used to set the input parameter in the kernel
> + *environment. This function launches kernel and copies data from GPU to
> + *CPU, or from CPU to GPU.
> + */
> +typedef int (* av_opencl_kernel_function) (void **userdata);
> +
> +/**
> + * Register a function for running the kernel specified by the kernel name.
> + *@param kernel_name   this kernel name is used to find the kernel in OpenCL runtime environment.
> + *@param function         user defined function,should not be NULL, it is used to set the input parameter in the kernel environment
> + *@return  >=0 on success, a negative error value on failure
> + */
> +int av_opencl_register_kernel_function(const char *kernel_name, av_opencl_kernel_function function);
> +
> +/**
> + *Load OpenCL kernel.
> + *
> + *@param kernel_name   this kernel name is used to find the kernel in OpenCL runtime environment.
> + *@param userdata         this userdata is the all parameters for running the kernel specified by kernel name
> + *@return  >=0 on success, a negative error value on failure
> + */
> +int av_opencl_run_kernel(const char *kernel_name, void **userdata);
> +
> +/**
> + * Init the run time  OpenCL environment.
> + *
> + *This function must be called befor calling any function related to OpenCL.This function should be called by a single thread.
> + *
> + *
> + *@param build_option         option of compile the kernel in OpenCL runtime environment,reference "OpenCL Specification Version: 1.2 chapter 5.6.4"
> + *@param ext_opencl_info    this is the extern OpenCL environment which the application program has created
> + *@return  >=0 on success, a negative error value on failure
> + */
> +int av_opencl_init(const char *build_option, AVOpenCLExternalInfo *ext_opencl_info);

It would be better to pass an AVDictionary here, so we can add more
options (e.g. paths and security policy options) without breaking the
API later. Or this could be done with a sort of per-component context,
in a similar way with what is done for example with avcodec_open2()
(where we pass all the options through an AVDictionary).
-- 
FFmpeg = Fanciful and Fiendish MultiPurpose Exxagerate Genius


More information about the ffmpeg-devel mailing list