[FFmpeg-devel] [PATCH V3] Add a filter implementing HDR image reconstruction from a single exposure using deep CNNs

Fri Oct 19 18:32:38 EEST 2018

On Fri, Oct 19, 2018 at 10:11 AM Guo, Yejun <yejun.guo at intel.com> wrote:

> see the algorithm's paper and code below.
>
> the filter's parameter looks like:
> sdr2hdr=model_filename=/path_to_tensorflow_graph.pb:out_fmt=gbrp10le
>

Can you add some usage documentation to doc/filters.texi?

> The input of the deep CNN model is RGB24 while the output is float
> for each color channel. This is the filter's default behavior to
> output format with gbrpf32le. And gbrp10le is also supported as the
> output, so we can see the rendering result in a player, as a reference.
>
> To generate the model file, we need modify the original script a little.
> - set name='y' for y_final within script at
> https://github.com/gabrieleilertsen/hdrcnn/blob/master/network.py
> - add the following code to the script at
> https://github.com/gabrieleilertsen/hdrcnn/blob/master/hdrcnn_predict.py
>
> graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
> ["y"])
> tf.train.write_graph(graph, '.', 'graph.pb', as_text=False)
>
> The filter only works when tensorflow C api is supported in the system,
> native backend is not supported since there are some different types of
> layers in the deep CNN model, besides CONV and DEPTH_TO_SPACE.
>
> https://arxiv.org/pdf/1710.07480.pdf:
>   author       = "Eilertsen, Gabriel and Kronander, Joel, and Denes,
> Gyorgy and Mantiuk, Rafał and Unger, Jonas",
>   title        = "HDR image reconstruction from a single exposure using
> deep CNNs",
>   journal      = "ACM Transactions on Graphics (TOG)",
>   number       = "6",
>   volume       = "36",
>   articleno    = "178",
>   year         = "2017"
>
> https://github.com/gabrieleilertsen/hdrcnn
>
> btw, as a whole solution, metadata should also be generated from
> the sdr video, so to be encoded as a HDR video. Not supported yet.
> This patch just focuses on this paper.
>

Is this something you are working on and will it be added later?

> v3: use int16_t instead of short
> v2: use AV_OPT_TYPE_PIXEL_FMT for filter option
>     remove some unnecessary code
>     Use in->linesize[0] and FFMAX/FFMIN
>     remove flag AVFILTER_FLAG_SLICE_THREADS
>     add av_log message when error
>

There is no need for this version-history block to be left in the commit log.

> Signed-off-by: Guo, Yejun <yejun.guo at intel.com>
> ---
>  libavfilter/Makefile     |   1 +
>  libavfilter/allfilters.c |   1 +
>  libavfilter/vf_sdr2hdr.c | 266
> +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 268 insertions(+)
>  create mode 100644 libavfilter/vf_sdr2hdr.c
>
> +static av_cold int init(AVFilterContext* context)
> +{
> +    SDR2HDRContext* ctx = context->priv;
> +
> +    if (ctx->out_fmt != AV_PIX_FMT_GBRPF32LE && ctx->out_fmt !=
> AV_PIX_FMT_GBRP10LE) {
> +        av_log(context, AV_LOG_ERROR, "could not support the output
> format\n");
> +        return AVERROR(ENOSYS);
> +    }
> +
> +#if (CONFIG_LIBTENSORFLOW == 1)
> +    ctx->dnn_module = ff_get_dnn_module(DNN_TF);
> +    if (!ctx->dnn_module){
> +        av_log(context, AV_LOG_ERROR, "could not create DNN module for
> tensorflow backend\n");
> +        return AVERROR(ENOMEM);
> +    }
> +    if (!ctx->model_filename){
> +        av_log(context, AV_LOG_ERROR, "model file for network was not
> specified\n");
> +        return AVERROR(EIO);
> +    }
> +    if (!ctx->dnn_module->load_model) {
> +        av_log(context, AV_LOG_ERROR, "load_model for network was not
> specified\n");
> +        return AVERROR(EIO);
> +    }
> +    ctx->model = (ctx->dnn_module->load_model)(ctx->model_filename);
> +    if (!ctx->model){
> +        av_log(context, AV_LOG_ERROR, "could not load DNN model\n");
> +        return AVERROR(EIO);
> +    }
> +    return 0;
> +#else
> +    return AVERROR(EIO);
> +#endif
> +}
>

This is incorrect. What you should do instead is make libtensorflow a
dependency of this filter in the configure file, so that the filter is
disabled when libtensorflow is not enabled.

> +
> +static int query_formats(AVFilterContext* context)
> +{
> +    const enum AVPixelFormat in_formats[] = {AV_PIX_FMT_RGB24,
> +                                             AV_PIX_FMT_NONE};
> +    enum AVPixelFormat out_formats[2];
> +    SDR2HDRContext* ctx = context->priv;
> +    AVFilterFormats* formats_list;
> +    int ret = 0;
> +
> +    formats_list = ff_make_format_list(in_formats);
> +    if ((ret = ff_formats_ref(formats_list,
> &context->inputs[0]->out_formats)) < 0)
> +        return ret;
> +
> +    out_formats[0] = ctx->out_fmt;
> +    out_formats[1] = AV_PIX_FMT_NONE;
> +    formats_list = ff_make_format_list(out_formats);
> +    if ((ret = ff_formats_ref(formats_list,
> &context->outputs[0]->in_formats)) < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int config_props(AVFilterLink* inlink)
> +{
> +    AVFilterContext* context = inlink->dst;
> +    SDR2HDRContext* ctx = context->priv;
> +    AVFilterLink* outlink = context->outputs[0];
> +    DNNReturnType result;
> +
> +    // the dnn model is tied with resolution due to deconv layer of
> tensorflow
> +    // now just support 1920*1080 and so the magic numbers within this
> file
> +    if (inlink->w != 1920 || inlink->h != 1080) {
> +        av_log(context, AV_LOG_ERROR, "only support frame size with
> 1920*1080\n");
> +        return AVERROR(ENOSYS);
> +     }
>

Is there any work planned to extend this to other resolutions?

> +
> +    ctx->input.width = 1920;
> +    ctx->input.height = 1088;  //the model requires height is a multiple
> of 32,
> +    ctx->input.channels = 3;
> +
> +    result = (ctx->model->set_input_output)(ctx->model->model,
> &ctx->input, &ctx->output);
> +    if (result != DNN_SUCCESS){
> +        av_log(context, AV_LOG_ERROR, "could not set input and output for
> the model\n");
> +        return AVERROR(EIO);
> +    }
> +
> +    memset(ctx->input.data, 0, ctx->input.channels * ctx->input.width *
> ctx->input.height * sizeof(float));
> +    outlink->h = 1080;
> +    outlink->w = 1920;
> +    return 0;
> +}
> +
> +static float qsort_comparison_function_float(const void *a, const void *b)
> +{
> +    return *(const float *)a - *(const float *)b;
> +}
> +
> +static int filter_frame(AVFilterLink* inlink, AVFrame* in)
> +{
> +    DNNReturnType dnn_result = DNN_SUCCESS;
> +    AVFilterContext* context = inlink->dst;
> +    SDR2HDRContext* ctx = context->priv;
> +    AVFilterLink* outlink = context->outputs[0];
> +    AVFrame* out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    int total_pixels = in->height * in->width;
> +
> +    av_frame_copy_props(out, in);
>

Check for allocation failures here: the frame returned by
ff_get_video_buffer() is used without a NULL check.

> +
> +    for (int i = 0; i < in->linesize[0] * in->height; ++i) {
> +        ctx->input.data[i] = in->data[0][i] / 255.0f;
> +    }
> +
> +    dnn_result = (ctx->dnn_module->execute_model)(ctx->model);
> +    if (dnn_result != DNN_SUCCESS){
> +        av_log(context, AV_LOG_ERROR, "failed to execute loaded model\n");
> +        return AVERROR(EIO);
> +    }
> +
> +    if (ctx->out_fmt == AV_PIX_FMT_GBRPF32LE) {
> +        float* outg = (float*)out->data[0];
> +        float* outb = (float*)out->data[1];
> +        float* outr = (float*)out->data[2];
> +        for (int i = 0; i < total_pixels; ++i) {
> +            float r = ctx->output.data[i*3];
> +            float g = ctx->output.data[i*3+1];
> +            float b = ctx->output.data[i*3+2];
> +            outr[i] = r;
> +            outg[i] = g;
> +            outb[i] = b;
> +        }
> +    } else {
> +        // here, we just use a rough mapping to the 10bit contents
> +        // meta data generation for HDR video encoding is not supported
> yet
> +        float* converted_data = (float*)malloc(total_pixels * 3 *
> sizeof(float));
>

Don't use malloc(); replace it with av_malloc(). Likewise, replace free() below with av_free().

> +        int16_t* outg = (int16_t*)out->data[0];
> +        int16_t* outb = (int16_t*)out->data[1];
> +        int16_t* outr = (int16_t*)out->data[2];
> +
> +        float max = 1.0f;
> +        for (int i = 0; i < total_pixels * 3; ++i) {
> +            float d = ctx->output.data[i];
> +            d = sqrt(d);
> +            converted_data[i] = d;
> +            max = FFMAX(d, max);
> +        }
> +
> +        if (max > 1.0f) {
> +            AV_QSORT(converted_data, total_pixels * 3, float,
> qsort_comparison_function_float);
> +            // 0.5% pixels are clipped
> +            max = converted_data[(int)(total_pixels * 3 * 0.995)];
> +            max = FFMAX(max, 1.0f);
> +
> +            for (int i = 0; i < total_pixels * 3; ++i) {
> +                float d = ctx->output.data[i];
> +                d = sqrt(d);
> +                d = FFMIN(d, max);
> +                converted_data[i] = d;
> +            }
> +        }
> +
> +        for (int i = 0; i < total_pixels; ++i) {
> +            float r = converted_data[i*3];
> +            float g = converted_data[i*3+1];
> +            float b = converted_data[i*3+2];
> +            outr[i] = r / max * 1023;
> +            outg[i] = g / max * 1023;
> +            outb[i] = b / max * 1023;
> +        }
> +
> +        free(converted_data);
> +    }
> +
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +}
> +
> +static av_cold void uninit(AVFilterContext* context)
> +{
> +    SDR2HDRContext* ctx = context->priv;
> +
> +    if (ctx->dnn_module){
> +        (ctx->dnn_module->free_model)(&ctx->model);
> +        av_freep(&ctx->dnn_module);
> +    }
> +}
> +
> +static const AVFilterPad sdr2hdr_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = config_props,
> +        .filter_frame = filter_frame,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad sdr2hdr_outputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_sdr2hdr = {
> +    .name          = "sdr2hdr",
> +    .description   = NULL_IF_CONFIG_SMALL("HDR image reconstruction from
> a single exposure using deep CNNs."),
>

Why "reconstruction"? There is nothing to reconstruct if the source
wasn't HDR to begin with.
"Tonemap" is probably a better term here, in my opinion.
The same applies to the previous uses.
-- 
Vittorio