[FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

Mon Jun 4 02:20:28 EEST 2018

On 29/05/18 06:54, Ruiling Song wrote:
> This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.
> 
> An example command to use this filter with vaapi codecs:
> FFMPEG -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
> opencl=ocl at va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
> vaapi -i INPUT -filter_hw_device ocl -filter_complex \
> '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
> [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT
> 
> v2:
> add peak detection.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  configure                              |   1 +
>  libavfilter/Makefile                   |   2 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/colorspace_basic.c         |  89 +++++
>  libavfilter/colorspace_basic.h         |  40 ++
>  libavfilter/opencl/colorspace_basic.cl | 187 ++++++++++
>  libavfilter/opencl/tonemap.cl          | 278 ++++++++++++++
>  libavfilter/opencl_source.h            |   2 +
>  libavfilter/vf_tonemap_opencl.c        | 655 +++++++++++++++++++++++++++++++++
>  9 files changed, 1255 insertions(+)
>  create mode 100644 libavfilter/colorspace_basic.c
>  create mode 100644 libavfilter/colorspace_basic.h
>  create mode 100644 libavfilter/opencl/colorspace_basic.cl
>  create mode 100644 libavfilter/opencl/tonemap.cl
>  create mode 100644 libavfilter/vf_tonemap_opencl.c

This segfaults when run on CPU implementations (both AMD and Intel on Windows) - can you check that?  Maybe an out-of-bounds memory reference which doesn't get noticed on a GPU.  (Apologies for the terrible report - I can only see it on opaque proprietary drivers, where it dies on some internal thread with no information at all.  The filter unfortunately can't run on pocl because of lack of R/RG support there.)

Still not sure why it fails on Mali (it doesn't feel like it uses a lot of memory so I'm not sure what's going wrong), but it does work well on AMD on Windows.

What set of implementations have you tested on?

> diff --git a/configure b/configure
> index e52f8f8..ee3586b 100755
> --- a/configure
> +++ b/configure
> @@ -3401,6 +3401,7 @@ tinterlace_filter_deps="gpl"
>  tinterlace_merge_test_deps="tinterlace_filter"
>  tinterlace_pad_test_deps="tinterlace_filter"
>  tonemap_filter_deps="const_nan"
> +tonemap_opencl_filter_deps="opencl"
>  unsharp_opencl_filter_deps="opencl"
>  uspp_filter_deps="gpl avcodec"
>  vaguedenoiser_filter_deps="gpl"
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index c68ef05..0915656 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -352,6 +352,8 @@ OBJS-$(CONFIG_TINTERLACE_FILTER)             += vf_tinterlace.o
>  OBJS-$(CONFIG_TLUT2_FILTER)                  += vf_lut2.o framesync.o
>  OBJS-$(CONFIG_TMIX_FILTER)                   += vf_mix.o framesync.o
>  OBJS-$(CONFIG_TONEMAP_FILTER)                += vf_tonemap.o
> +OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER)         += vf_tonemap_opencl.o colorspace_basic.o opencl.o \
> +                                                opencl/tonemap.o opencl/colorspace_basic.o
>  OBJS-$(CONFIG_TRANSPOSE_FILTER)              += vf_transpose.o
>  OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
>  OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index b44093d..6873bab 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -343,6 +343,7 @@ extern AVFilter ff_vf_tinterlace;
>  extern AVFilter ff_vf_tlut2;
>  extern AVFilter ff_vf_tmix;
>  extern AVFilter ff_vf_tonemap;
> +extern AVFilter ff_vf_tonemap_opencl;
>  extern AVFilter ff_vf_transpose;
>  extern AVFilter ff_vf_trim;
>  extern AVFilter ff_vf_unpremultiply;
> diff --git a/libavfilter/colorspace_basic.c b/libavfilter/colorspace_basic.c
> new file mode 100644
> index 0000000..93f9f08
> --- /dev/null
> +++ b/libavfilter/colorspace_basic.c

The name of this file feels strange to me.  It's common parts used by colorspace-related filters, so maybe just colorspace.c?

> @@ -0,0 +1,89 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "colorspace_basic.h"
> +
> +
> +void invert_matrix3x3(const double in[3][3], double out[3][3])
> +{
> +    double m00 = in[0][0], m01 = in[0][1], m02 = in[0][2],
> +           m10 = in[1][0], m11 = in[1][1], m12 = in[1][2],
> +           m20 = in[2][0], m21 = in[2][1], m22 = in[2][2];
> +    int i, j;
> +    double det;
> +
> +    out[0][0] =  (m11 * m22 - m21 * m12);
> +    out[0][1] = -(m01 * m22 - m21 * m02);
> +    out[0][2] =  (m01 * m12 - m11 * m02);
> +    out[1][0] = -(m10 * m22 - m20 * m12);
> +    out[1][1] =  (m00 * m22 - m20 * m02);
> +    out[1][2] = -(m00 * m12 - m10 * m02);
> +    out[2][0] =  (m10 * m21 - m20 * m11);
> +    out[2][1] = -(m00 * m21 - m20 * m01);
> +    out[2][2] =  (m00 * m11 - m10 * m01);
> +
> +    det = m00 * out[0][0] + m10 * out[0][1] + m20 * out[0][2];
> +    det = 1.0 / det;
> +
> +    for (i = 0; i < 3; i++) {
> +        for (j = 0; j < 3; j++)
> +            out[i][j] *= det;
> +    }
> +}
> +
> +void mul3x3(double dst[3][3], const double src1[3][3], const double src2[3][3])
> +{
> +    int m, n;
> +
> +    for (m = 0; m < 3; m++)
> +        for (n = 0; n < 3; n++)
> +            dst[m][n] = src2[m][0] * src1[0][n] +
> +                        src2[m][1] * src1[1][n] +
> +                        src2[m][2] * src1[2][n];
> +}
> +/*
> + * see e.g. http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
> + */
> +void fill_rgb2xyz_table(const struct ColorPrimaries *coeffs,
> +                        const struct WhitePoint *wp,
> +                        double rgb2xyz[3][3])
> +{
> +    double i[3][3], sr, sg, sb, zw;
> +
> +    rgb2xyz[0][0] = coeffs->xr / coeffs->yr;
> +    rgb2xyz[0][1] = coeffs->xg / coeffs->yg;
> +    rgb2xyz[0][2] = coeffs->xb / coeffs->yb;
> +    rgb2xyz[1][0] = rgb2xyz[1][1] = rgb2xyz[1][2] = 1.0;
> +    rgb2xyz[2][0] = (1.0 - coeffs->xr - coeffs->yr) / coeffs->yr;
> +    rgb2xyz[2][1] = (1.0 - coeffs->xg - coeffs->yg) / coeffs->yg;
> +    rgb2xyz[2][2] = (1.0 - coeffs->xb - coeffs->yb) / coeffs->yb;
> +    invert_matrix3x3(rgb2xyz, i);
> +    zw = 1.0 - wp->xw - wp->yw;
> +    sr = i[0][0] * wp->xw + i[0][1] * wp->yw + i[0][2] * zw;
> +    sg = i[1][0] * wp->xw + i[1][1] * wp->yw + i[1][2] * zw;
> +    sb = i[2][0] * wp->xw + i[2][1] * wp->yw + i[2][2] * zw;
> +    rgb2xyz[0][0] *= sr;
> +    rgb2xyz[0][1] *= sg;
> +    rgb2xyz[0][2] *= sb;
> +    rgb2xyz[1][0] *= sr;
> +    rgb2xyz[1][1] *= sg;
> +    rgb2xyz[1][2] *= sb;
> +    rgb2xyz[2][0] *= sr;
> +    rgb2xyz[2][1] *= sg;
> +    rgb2xyz[2][2] *= sb;
> +}

Since this is copied from vf_colorspace.c, please remove the static versions from there at the same time.

Also, you should check whether any copyright statement needs to be propgated into this file.

> diff --git a/libavfilter/colorspace_basic.h b/libavfilter/colorspace_basic.h
> new file mode 100644
> index 0000000..5647ca6
> --- /dev/null
> +++ b/libavfilter/colorspace_basic.h
> @@ -0,0 +1,40 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVFILTER_COLORSPACE_BASIC_H
> +#define AVFILTER_COLORSPACE_BASIC_H
> +
> +#include "libavutil/common.h"
> +
> +struct LumaCoefficients {
> +    double cr, cg, cb;
> +};
> +
> +struct ColorPrimaries {
> +    double xr, yr, xg, yg, xb, yb;
> +};
> +
> +struct WhitePoint {
> +    double xw, yw;
> +};
> +
> +void invert_matrix3x3(const double in[3][3], double out[3][3]);
> +void mul3x3(double dst[3][3], const double src1[3][3], const double src2[3][3]);
> +void fill_rgb2xyz_table(const struct ColorPrimaries *coeffs,
> +                        const struct WhitePoint *wp, double rgb2xyz[3][3]);
> +#endif
> diff --git a/libavfilter/opencl/colorspace_basic.cl b/libavfilter/opencl/colorspace_basic.cl
> new file mode 100644
> index 0000000..eaea253
> --- /dev/null
> +++ b/libavfilter/opencl/colorspace_basic.cl
> @@ -0,0 +1,187 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define ST2084_MAX_LUMINANCE 10000.0f
> +#define REFERENCE_WHITE 100.0f
> +constant const float ST2084_M1 = 0.1593017578125f;
> +constant const float ST2084_M2 = 78.84375f;
> +constant const float ST2084_C1 = 0.8359375f;
> +constant const float ST2084_C2 = 18.8515625f;
> +constant const float ST2084_C3 = 18.6875f;
> +
> +__constant float yuv2rgb_bt2020[] = {
> +    1.0f, 0.0f, 1.4746f,
> +    1.0f, -0.16455f, -0.57135f,
> +    1.0f, 1.8814f, 0.0f
> +};
> +
> +__constant float yuv2rgb_bt709[] = {
> +    1.0f, 0.0f, 1.5748f,
> +    1.0f, -0.18732f, -0.46812f,
> +    1.0f, 1.8556f, 0.0f
> +};
> +
> +__constant float rgb2yuv_bt709[] = {
> +    0.2126f, 0.7152f, 0.0722f,
> +    -0.11457f, -0.38543f, 0.5f,
> +    0.5f, -0.45415f, -0.04585f
> +};

These don't look like the matrices I would expect for BT.709.  Can you explain where they come from?  (I think I must be missing some subtlety here.)

> +
> +__constant float rgb2yuv_bt2020[] ={
> +    0.2627f, 0.678f, 0.0593f,
> +    -0.1396f, -0.36037f, 0.5f,
> +    0.5f, -0.4598f, -0.0402f,
> +};
> +
> +
> +float get_luma_dst(float3 c) {
> +    return luma_dst.x * c.x + luma_dst.y * c.y + luma_dst.z * c.z;
> +}
> +
> +float get_luma_src(float3 c) {
> +    return luma_src.x * c.x + luma_src.y * c.y + luma_src.z * c.z;
> +}
> +
> +float eotf_st2084(float x) {
> +    float p = powr(x, 1.0f / ST2084_M2);
> +    float a = max(p -ST2084_C1, 0.0f);
> +    float b = max(ST2084_C2 - ST2084_C3 * p, 1e-6f);
> +    float c  = powr(a / b, 1.0f / ST2084_M1);
> +    return x > 0.0f ? c * ST2084_MAX_LUMINANCE / REFERENCE_WHITE : 0.0f;
> +}
> +
> +__constant const float HLG_A = 0.17883277f;
> +__constant const float HLG_B = 0.28466892f;
> +__constant const float HLG_C = 0.55991073f;
> +
> +// linearizer for HLG
> +float inverse_oetf_hlg(float x) {
> +    float a = 4.0f * x * x;
> +    float b = exp((x - HLG_C) / HLG_A) + HLG_B;
> +    return x < 0.5f ? a : b;
> +}
> +
> +// delinearizer for HLG
> +float oetf_hlg(float x) {
> +    float a = 0.5f * sqrt(x);
> +    float b = HLG_A * log(x - HLG_B) + HLG_C;
> +    return x <= 1.0f ? a : b;
> +}
> +
> +float3 ootf_hlg(float3 c, float peak) {
> +    float luma = get_luma_src(c);
> +    float gamma =  1.2f + 0.42f * log10(peak * REFERENCE_WHITE / 1000.0f);
> +    gamma = max(1.0f, gamma);
> +    float factor = peak * powr(luma, gamma - 1.0f) / powr(12.0f, gamma);
> +    return c * factor;
> +}
> +
> +float3 inverse_ootf_hlg(float3 c, float peak) {
> +    float gamma = 1.2f + 0.42f * log10(peak * REFERENCE_WHITE / 1000.0f);
> +    c *=  powr(12.0f, gamma) / peak;
> +    c /= powr(get_luma_dst(c), (gamma - 1.0f) / gamma);
> +    return c;
> +}
> +
> +float inverse_eotf_bt1886(float c) {
> +    return c < 0.0f ? 0.0f : powr(c, 1.0f / 2.4f);
> +}
> +
> +float oetf_bt709(float c) {
> +    c = c < 0.0f ? 0.0f : c;
> +    float r1 = 4.5f * c;
> +    float r2 = 1.099f * powr(c, 0.45f) - 0.099f;
> +    return c < 0.018f ? r1 : r2;
> +}
> +float inverse_oetf_bt709(float c) {
> +    float r1 = c / 4.5f;
> +    float r2 = powr((c + 0.099f) / 1.099f, 1.0f / 0.45f);
> +    return c < 0.081f ? r1 : r2;
> +}
> +
> +float3 yuv2rgb(float y, float u, float v) {
> +#ifdef FULL_RANGE_IN
> +    u -= 0.5f; v -= 0.5f;
> +#else
> +    y = (y * 255.0f -  16.0f) / 219.0f;
> +    u = (u * 255.0f - 128.0f) / 224.0f;
> +    v = (v * 255.0f - 128.0f) / 224.0f;
> +#endif
> +    float r = y * rgb_matrix[0] + u * rgb_matrix[1] + v * rgb_matrix[2];
> +    float g = y * rgb_matrix[3] + u * rgb_matrix[4] + v * rgb_matrix[5];
> +    float b = y * rgb_matrix[6] + u * rgb_matrix[7] + v * rgb_matrix[8];
> +    return (float3)(r, g, b);
> +}
> +
> +float3 yuv2lrgb(float3 yuv) {
> +    float3 rgb = yuv2rgb(yuv.x, yuv.y, yuv.z);
> +    float r = linearize(rgb.x);
> +    float g = linearize(rgb.y);
> +    float b = linearize(rgb.z);
> +    return (float3)(r, g, b);
> +}
> +
> +float3 rgb2yuv(float r, float g, float b) {
> +    float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2];
> +    float u = r*yuv_matrix[3] + g*yuv_matrix[4] + b*yuv_matrix[5];
> +    float v = r*yuv_matrix[6] + g*yuv_matrix[7] + b*yuv_matrix[8];
> +#ifdef FULL_RANGE_OUT
> +    u += 0.5f; v += 0.5f;
> +#else
> +    y = (219.0f * y + 16.0f) / 255.0f;
> +    u = (224.0f * u + 128.0f) / 255.0f;
> +    v = (224.0f * v + 128.0f) / 255.0f;
> +#endif
> +    return (float3)(y, u, v);
> +}
> +
> +float3 lrgb2yuv(float3 c) {
> +    float r = delinearize(c.x);
> +    float g = delinearize(c.y);
> +    float b = delinearize(c.z);
> +
> +    return rgb2yuv(r, g, b);
> +}
> +
> +float3 lrgb2lrgb(float3 c) {
> +#ifdef RGB2RGB_PASSTHROUGH
> +    return c;
> +#else
> +    float r = c.x, g = c.y, b = c.z;
> +    float rr = rgb2rgb[0] * r + rgb2rgb[1] * g + rgb2rgb[2] * b;
> +    float gg = rgb2rgb[3] * r + rgb2rgb[4] * g + rgb2rgb[5] * b;
> +    float bb = rgb2rgb[6] * r + rgb2rgb[7] * g + rgb2rgb[8] * b;
> +    return (float3)(rr, gg, bb);
> +#endif
> +}
> +
> +float3 ootf(float3 c, float peak) {
> +#ifdef ootf_impl
> +    return ootf_impl(c, peak);
> +#else
> +    return c;
> +#endif
> +}
> +
> +float3 inverse_ootf(float3 c, float peak) {
> +#ifdef inverse_ootf_impl
> +    return inverse_ootf_impl(c, peak);
> +#else
> +    return c;
> +#endif
> +}
> diff --git a/libavfilter/opencl/tonemap.cl b/libavfilter/opencl/tonemap.cl
> new file mode 100644
> index 0000000..f88bc40
> --- /dev/null
> +++ b/libavfilter/opencl/tonemap.cl
> @@ -0,0 +1,278 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define REFERENCE_WHITE 100.0f
> +extern float3 lrgb2yuv(float3);
> +extern float3 yuv2lrgb(float3);
> +extern float3 lrgb2lrgb(float3);
> +extern float get_luma_src(float3);
> +extern float get_luma_dst(float3);
> +extern float3 ootf(float3 c, float peak);
> +extern float3 inverse_ootf(float3 c, float peak);
> +struct detection_result {
> +    float peak;
> +    float average;
> +};
> +
> +float hable_f(float in) {
> +    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
> +    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
> +}
> +
> +float direct(float s, float peak) {
> +    return s;
> +}
> +
> +float linear(float s, float peak) {
> +    return s * tone_param / peak;
> +}
> +
> +float gamma(float s, float peak) {
> +    float p = s > 0.05f ? s /peak : 0.05f / peak;
> +    float v = pow(p, 1.0f / tone_param);
> +    return s > 0.05f ? v : (s * v /0.05f);
> +}
> +
> +float clip(float s, float peak) {
> +    return clamp(s * tone_param, 0.0f, 1.0f);
> +}
> +
> +float reinhard(float s, float peak) {
> +    return s / (s + tone_param) * (peak + tone_param) / peak;
> +}
> +
> +float hable(float s, float peak) {
> +    return hable_f(s)/hable_f(peak);
> +}
> +
> +float mobius(float s, float peak) {
> +    float j = tone_param;
> +    float a, b;
> +
> +    if (s <= j)
> +        return s;
> +
> +    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
> +    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
> +
> +    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
> +}
> +
> +// detect peak/average signal of a frame, the algorithm was ported from:
> +// libplacebo (https://github.com/haasn/libplacebo)
> +struct detection_result
> +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
> +            float signal, float peak) {
> +// layout of the util buffer
> +//
> +// Name:             : Size (units of 4-bytes)
> +// average buffer    : detection_frames + 1
> +// peak buffer       : detection_frames + 1
> +// workgroup counter : 1
> +// total of peak     : 1
> +// total of average  : 1
> +// frame index       : 1
> +// frame number      : 1
> +    global uint *avg_buf = util_buf;
> +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
> +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
> +    global uint *max_total_p = counter_wg_p + 1;
> +    global uint *avg_total_p = max_total_p + 1;
> +    global uint *frame_idx_p = avg_total_p + 1;
> +    global uint *scene_frame_num_p = frame_idx_p + 1;
> +
> +    uint frame_idx = *frame_idx_p;
> +    uint scene_frame_num = *scene_frame_num_p;
> +
> +    size_t lidx = get_local_id(0);
> +    size_t lidy = get_local_id(1);
> +    size_t lsizex = get_local_size(0);
> +    size_t lsizey = get_local_size(1);
> +    uint num_wg = get_num_groups(0) * get_num_groups(1);
> +    size_t group_idx = get_group_id(0);
> +    size_t group_idy = get_group_id(1);
> +    struct detection_result r = {peak, sdr_avg};
> +    if (lidx == 0 && lidy == 0)
> +        *sum_wg = 0;
> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update workgroup sum
> +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update frame peak/avg using work-group-average.
> +    if (lidx == 0 && lidy == 0) {
> +        uint avg_wg = *sum_wg / (lsizex * lsizey);
> +        atomic_max(&peak_buf[frame_idx], avg_wg);
> +        atomic_add(&avg_buf[frame_idx], avg_wg);
> +    }
> +
> +    if (scene_frame_num > 0) {
> +        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        r.peak = max(1.0f, peak);
> +        r.average = max(0.25f, avg);
> +    }
> +
> +    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
> +        *counter_wg_p = 0;
> +        avg_buf[frame_idx] /= num_wg;
> +
> +        if (scene_threshold > 0.0f) {
> +            uint cur_max = peak_buf[frame_idx];
> +            uint cur_avg = avg_buf[frame_idx];
> +            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
> +
> +            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  avg_buf[i] = 0;
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  peak_buf[i] = 0;
> +                *avg_total_p = *max_total_p = 0;
> +                *scene_frame_num_p = 0;
> +                avg_buf[frame_idx] = cur_avg;
> +                peak_buf[frame_idx] = cur_max;
> +            }
> +        }
> +        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
> +        // add current frame, subtract next frame
> +        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
> +        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
> +        // reset next frame
> +        peak_buf[next] = avg_buf[next] = 0;
> +        *frame_idx_p = next;
> +        *scene_frame_num_p = min(*scene_frame_num_p + 1, (uint)DETECTION_FRAMES);
> +    }
> +    return r;
> +}
> +
> +float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
> +    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
> +
> +    // Rescale the variables in order to bring it into a representation where
> +    // 1.0 represents the dst_peak. This is because all of the tone mapping
> +    // algorithms are defined in such a way that they map to the range [0.0, 1.0].
> +    if (target_peak > 1.0f) {
> +        sig *= 1.0f / target_peak;
> +        peak *= 1.0f / target_peak;
> +    }
> +
> +    float sig_old = sig;
> +
> +    // Scale the signal to compensate for differences in the average brightness
> +    float slope = min(1.0f, sdr_avg / average);
> +    sig *= slope;
> +    peak *= slope;
> +
> +    // Desaturate the color using a coefficient dependent on the signal level
> +    if (desat_param > 0.0f) {
> +        float luma = get_luma_dst(rgb);
> +        float coeff = max(sig - 0.18f, 1e-6f) / max(sig, 1e-6f);
> +        coeff = native_powr(coeff, 10.0f / desat_param);
> +        rgb = mix(rgb, (float3)luma, (float3)coeff);
> +        sig = mix(sig, luma * slope, coeff);
> +    }
> +
> +    sig = TONE_FUNC(sig, peak);
> +
> +    sig = min(sig, 1.0f);
> +    rgb *= (sig/sig_old);
> +    return rgb;
> +}
> +// map from source space YUV to destination space RGB
> +float3 map_to_dst_space_from_yuv(float3 yuv, float peak) {
> +    float3 c = yuv2lrgb(yuv);
> +    c = ootf(c, peak);
> +    c = lrgb2lrgb(c);
> +    return c;
> +}
> +
> +// convert from rgb to yuv, with possible inverse-ootf
> +float3 convert_to_yuv(float3 c, float peak) {
> +    c = inverse_ootf(c, peak);
> +    return lrgb2yuv(c);
> +}
> +
> +__kernel void tonemap(__write_only image2d_t dst1,
> +                      __write_only image2d_t dst2,
> +                      __read_only  image2d_t src1,
> +                      __read_only  image2d_t src2,
> +#ifdef THIRD_PLANE
> +                      __write_only image2d_t dst3,
> +                      __read_only  image2d_t src3,
> +#endif

THIRD_PLANE isn't currently set anywhere.  I think either make it work (add the pixel formats) or remove the unused code.

> +                      global uint *util_buf,
> +                      float peak
> +                      )
> +{
> +    __local uint sum_wg;
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int xi = get_global_id(0);
> +    int yi = get_global_id(1);
> +    // each work item process four pixels
> +    int x = 2 * xi;
> +    int y = 2 * yi;
> +
> +    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;
> +    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
> +    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;
> +    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
> +#ifdef THIRD_PLANE
> +    float u = read_imagef(src2, sampler, (int2)(xi, yi)).x;
> +    float v = read_imagef(src3, sampler, (int2)(xi, yi)).x;
> +    float2 uv = (float2)(u, v);
> +#else
> +    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;
> +#endif
> +
> +    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y), peak);
> +    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y), peak);
> +    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y), peak);
> +    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y), peak);
> +
> +    float sig0 = max(c0.x, max(c0.y, c0.z));
> +    float sig1 = max(c1.x, max(c1.y, c1.z));
> +    float sig2 = max(c2.x, max(c2.y, c2.z));
> +    float sig3 = max(c3.x, max(c3.y, c3.z));
> +    float sig = max(sig0, max(sig1, max(sig2, sig3)));
> +
> +    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
> +
> +    float3 c0_old = c0, c1_old = c1, c2_old = c2;
> +    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
> +    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
> +    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
> +    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
> +
> +    float3 yuv0 = convert_to_yuv(c0, target_peak);
> +    float3 yuv1 = convert_to_yuv(c1, target_peak);
> +    float3 yuv2 = convert_to_yuv(c2, target_peak);
> +    float3 yuv3 = convert_to_yuv(c3, target_peak);
> +
> +    write_imagef(dst1, (int2)(x, y), (float4)(yuv0.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y), (float4)(yuv1.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x, y+1), (float4)(yuv2.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y+1), (float4)(yuv3.x, 0.0f, 0.0f, 1.0f));
> +#ifdef THIRD_PLANE
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst3, (int2)(xi, yi), (float4)(yuv0.z, 0.0f, 0.0f, 1.0f));
> +#else
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, yuv0.z, 0.0f, 1.0f));
> +#endif
> +}

I'm not sure if it makes any significant difference, but should the chroma sampling location be taken into account here?  This is reading as if a centre value and writing as if top-left, but most things will probably be centre-left.

> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 4bb9969..c5b3f37 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -21,7 +21,9 @@
>  
>  extern const char *ff_opencl_source_avgblur;
>  extern const char *ff_opencl_source_convolution;
> +extern const char *ff_opencl_source_colorspace_basic;
>  extern const char *ff_opencl_source_overlay;
> +extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_unsharp;
>  
>  #endif /* AVFILTER_OPENCL_SOURCE_H */
> diff --git a/libavfilter/vf_tonemap_opencl.c b/libavfilter/vf_tonemap_opencl.c
> new file mode 100644
> index 0000000..1a5bb20
> --- /dev/null
> +++ b/libavfilter/vf_tonemap_opencl.c
> @@ -0,0 +1,655 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include <float.h>
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/bprint.h"
> +#include "libavutil/common.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/mastering_display_metadata.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +#include "colorspace_basic.h"
> +
> +//#define DEBUG
> +// TODO:
> +// - seperate peak-detection from tone-mapping kernel to solve
> +//    one-frame-delay issue.
> +// - import colorspace matrix generation from vf_colorspace.c
> +// - more format support
> +
> +#define DETECTION_FRAMES 63
> +#define REFERENCE_WHITE 100.0f
> +
> +enum TonemapAlgorithm {
> +    TONEMAP_NONE,
> +    TONEMAP_LINEAR,
> +    TONEMAP_GAMMA,
> +    TONEMAP_CLIP,
> +    TONEMAP_REINHARD,
> +    TONEMAP_HABLE,
> +    TONEMAP_MOBIUS,
> +    TONEMAP_MAX,
> +};
> +
> +typedef struct TonemapOpenCLContext {
> +    OpenCLFilterContext ocf;
> +
> +    enum AVColorSpace colorspace, colorspace_in, colorspace_out;
> +    enum AVColorTransferCharacteristic trc, trc_in, trc_out;
> +    enum AVColorPrimaries primaries, primaries_in, primaries_out;
> +    enum AVColorRange range, range_in, range_out;
> +
> +    enum TonemapAlgorithm tonemap;
> +    enum AVPixelFormat    format;
> +    double                peak;
> +    double                param;
> +    double                desat_param;
> +    double                target_peak;
> +    double                scene_threshold;
> +    int                   initialised;
> +    cl_kernel             kernel;
> +    cl_command_queue      command_queue;
> +    cl_mem                util_mem;
> +} TonemapOpenCLContext;
> +
> +const char *yuv_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "rgb2yuv_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "rgb2yuv_bt2020",
> +};
> +
> +const char *rgb_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "yuv2rgb_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "yuv2rgb_bt2020",
> +};
> +
> +const char *linearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_SMPTE2084] = "eotf_st2084",
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_oetf_hlg",
> +};
> +
> +const char *delinearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_BT709]     = "inverse_eotf_bt1886",
> +    [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886",
> +};
> +
> +static const struct LumaCoefficients luma_coefficients[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709]      = { 0.2126, 0.7152, 0.0722 },
> +    [AVCOL_SPC_BT2020_NCL] = { 0.2627, 0.6780, 0.0593 },
> +};
> +
> +struct ColorPrimaries primaries_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.640, 0.330, 0.300, 0.600, 0.150, 0.060 },
> +    [AVCOL_PRI_BT2020] = { 0.708, 0.292, 0.170, 0.797, 0.131, 0.046 },
> +};
> +
> +struct WhitePoint whitepoint_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.3127, 0.3290 },
> +    [AVCOL_PRI_BT2020] = { 0.3127, 0.3290 },
> +};
> +
> +const char *tonemap_func[TONEMAP_MAX] = {
> +    [TONEMAP_NONE]     = "direct",
> +    [TONEMAP_LINEAR]   = "linear",
> +    [TONEMAP_GAMMA]    = "gamma",
> +    [TONEMAP_CLIP]     = "clip",
> +    [TONEMAP_REINHARD] = "reinhard",
> +    [TONEMAP_HABLE]    = "hable",
> +    [TONEMAP_MOBIUS]   = "mobius",
> +};
> +
> +static void get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out,
> +                               double rgb2rgb[3][3]) {
> +    double rgb2xyz[3][3], xyz2rgb[3][3];
> +
> +    fill_rgb2xyz_table(&primaries_table[out], &whitepoint_table[out], rgb2xyz);
> +    invert_matrix3x3(rgb2xyz, xyz2rgb);
> +    fill_rgb2xyz_table(&primaries_table[in], &whitepoint_table[in], rgb2xyz);
> +    mul3x3(rgb2rgb, rgb2xyz, xyz2rgb);
> +}
> +
> +#define OPENCL_SOURCE_NB 3
> +// Average light level for SDR signals. This is equal to a signal level of 0.5
> +// under a typical presentation gamma of about 2.0.
> +static const float sdr_avg = 0.25f;
> +
> +static int tonemap_opencl_init(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int rgb2rgb_passthrough = 1;
> +    double rgb2rgb[3][3];
> +    struct LumaCoefficients luma_src, luma_dst;
> +    cl_int cle;
> +    int err;
> +    AVBPrint header;
> +    const char *opencl_sources[OPENCL_SOURCE_NB];
> +
> +    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
> +
> +    switch(ctx->tonemap) {
> +    case TONEMAP_GAMMA:
> +        if (isnan(ctx->param))
> +            ctx->param = 1.8f;
> +        break;
> +    case TONEMAP_REINHARD:
> +        if (!isnan(ctx->param))
> +            ctx->param = (1.0f - ctx->param) / ctx->param;
> +        break;
> +    case TONEMAP_MOBIUS:
> +        if (isnan(ctx->param))
> +            ctx->param = 0.3f;
> +        break;
> +    }
> +
> +    if (isnan(ctx->param))
> +        ctx->param = 1.0f;
> +
> +    // SDR peak is 1.0f
> +    ctx->target_peak = 1.0f;
> +    av_log(ctx, AV_LOG_DEBUG, "tone mapping transfer from %s to %s\n",
> +           av_color_transfer_name(ctx->trc_in),
> +           av_color_transfer_name(ctx->trc_out));
> +    av_log(ctx, AV_LOG_DEBUG, "mapping colorspace from %s to %s\n",
> +           av_color_space_name(ctx->colorspace_in),
> +           av_color_space_name(ctx->colorspace_out));
> +    av_log(ctx, AV_LOG_DEBUG, "mapping primaries from %s to %s\n",
> +           av_color_primaries_name(ctx->primaries_in),
> +           av_color_primaries_name(ctx->primaries_out));
> +    av_log(ctx, AV_LOG_DEBUG, "mapping range from %s to %s\n",
> +           av_color_range_name(ctx->range_in),
> +           av_color_range_name(ctx->range_out));
> +    // checking valid value just because of limited implementaion
> +    // please remove when more functionalities are implemented
> +    av_assert0(ctx->trc_out == AVCOL_TRC_BT709 ||
> +               ctx->trc_out == AVCOL_TRC_BT2020_10);
> +    av_assert0(ctx->trc_in == AVCOL_TRC_SMPTE2084||
> +               ctx->trc_in == AVCOL_TRC_ARIB_STD_B67);
> +    av_assert0(ctx->colorspace_in == AVCOL_SPC_BT2020_NCL ||
> +               ctx->colorspace_in == AVCOL_SPC_BT709);
> +    av_assert0(ctx->primaries_in == AVCOL_PRI_BT2020 ||
> +               ctx->primaries_in == AVCOL_PRI_BT709);
> +
> +    av_bprintf(&header, "__constant const float tone_param = %.4ff;\n",
> +               ctx->param);
> +    av_bprintf(&header, "__constant const float desat_param = %.4ff;\n",
> +               ctx->desat_param);
> +    av_bprintf(&header, "__constant const float target_peak = %.4ff;\n",
> +               ctx->target_peak);
> +    av_bprintf(&header, "__constant const float sdr_avg = %.4ff;\n", sdr_avg);
> +    av_bprintf(&header, "__constant const float scene_threshold = %.4ff;\n",
> +               ctx->scene_threshold);
> +    av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]);
> +    av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES);
> +
> +    if (ctx->primaries_out != ctx->primaries_in) {
> +        get_rgb2rgb_matrix(ctx->primaries_in, ctx->primaries_out, rgb2rgb);
> +        rgb2rgb_passthrough = 0;
> +    }
> +    if (ctx->range_in == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_IN\n");
> +
> +    if (ctx->range_out == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_OUT\n");
> +
> +    if (rgb2rgb_passthrough)
> +        av_bprintf(&header, "#define RGB2RGB_PASSTHROUGH\n");
> +    else {
> +        av_bprintf(&header, "__constant float rgb2rgb[9] = {\n");
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[0][0], rgb2rgb[0][1], rgb2rgb[0][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[1][0], rgb2rgb[1][1], rgb2rgb[1][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff};\n",
> +                   rgb2rgb[2][0], rgb2rgb[2][1], rgb2rgb[2][2]);
> +    }
> +
> +    av_bprintf(&header, "#define rgb_matrix %s\n",
> +               rgb_coff[ctx->colorspace_in]);
> +    av_bprintf(&header, "#define yuv_matrix %s\n",
> +               yuv_coff[ctx->colorspace_out]);
> +
> +    luma_src = luma_coefficients[ctx->colorspace_in];
> +    luma_dst = luma_coefficients[ctx->colorspace_out];
> +    av_bprintf(&header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_src.cr, luma_src.cg, luma_src.cb);
> +    av_bprintf(&header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_dst.cr, luma_dst.cg, luma_dst.cb);
> +
> +    av_bprintf(&header, "#define linearize %s\n", linearize_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define delinearize %s\n",
> +               delinearize_funcs[ctx->trc_out]);
> +
> +    if (ctx->trc_in == AVCOL_TRC_ARIB_STD_B67)
> +        av_bprintf(&header, "#define ootf_impl ootf_hlg\n");
> +
> +    if (ctx->trc_out == AVCOL_TRC_ARIB_STD_B67)
> +        av_bprintf(&header, "#define inverse_ootf_impl inverse_ootf_hlg\n");
> +
> +    av_log(avctx, AV_LOG_DEBUG, "Generated OpenCL header:\n%s\n", header.str);
> +    opencl_sources[0] = header.str;
> +    opencl_sources[1] = ff_opencl_source_tonemap;
> +    opencl_sources[2] = ff_opencl_source_colorspace_basic;
> +    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
> +
> +    av_bprint_finalize(&header, NULL);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    if (!ctx->command_queue) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
> +               "command queue: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->kernel = clCreateKernel(ctx->ocf.program, "tonemap", &cle);
> +    if (!ctx->kernel) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->util_mem =
> +        clCreateBuffer(ctx->ocf.hwctx->context, 0,
> +                       (2 * DETECTION_FRAMES + 7) * sizeof(unsigned),
> +                       NULL, &cle);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create util buffer: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->kernel)
> +        clReleaseKernel(ctx->kernel);
> +    return err;
> +}
> +
> +static int tonemap_opencl_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *avctx = outlink->src;
> +    TonemapOpenCLContext *s = avctx->priv;
> +    int ret;
> +    if (s->format == AV_PIX_FMT_NONE)
> +        av_log(avctx, AV_LOG_WARNING, "format not set, use default format NV12\n");
> +    else {
> +      if (s->format != AV_PIX_FMT_P010 &&
> +          s->format != AV_PIX_FMT_NV12) {
> +        av_log(avctx, AV_LOG_ERROR, "unsupported output format,"
> +               "only p010/nv12 supported now\n");
> +        return AVERROR(EINVAL);
> +      }
> +    }
> +
> +    s->ocf.output_format = s->format == AV_PIX_FMT_NONE ? AV_PIX_FMT_NV12 : s->format;
> +    ret = ff_opencl_filter_config_output(outlink);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int launch_kernel(AVFilterContext *avctx, cl_kernel kernel,
> +                         AVFrame *output, AVFrame *input, float peak) {
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int err = AVERROR(ENOSYS);
> +    size_t global_work[2];
> +    size_t local_work[2];
> +    cl_int cle;
> +
> +    cle = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 4, sizeof(cl_mem), &ctx->util_mem);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 5, sizeof(cl_float), &peak);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "peak luma: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    local_work[0]  = 16;
> +    local_work[1]  = 16;
> +    // Note the work size based on uv plane, as we process a 2x2 quad in one workitem
> +    err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
> +                                                1, 16);
> +    if (err < 0)
> +        return err;
> +
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
> +                                 global_work, local_work,
> +                                 0, NULL, NULL);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to enqueue kernel: %d.\n",
> +               cle);
> +        return AVERROR(EIO);
> +    }
> +    return 0;
> +}
> +
> +static double determine_signal_peak(AVFrame *in)
> +{
> +    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
> +    double peak = 0;
> +
> +    if (sd) {
> +        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;
> +        peak = clm->MaxCLL / REFERENCE_WHITE;
> +    }
> +
> +    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
> +    if (!peak && sd) {
> +        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;
> +        if (metadata->has_luminance)
> +            peak = av_q2d(metadata->max_luminance) / REFERENCE_WHITE;
> +    }
> +
> +    // For untagged source, use peak of 10000 if SMPTE ST.2084
> +    // otherwise assume HLG with reference display peak 1000.
> +    if (!peak)
> +        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 10.0f;
> +
> +    return peak;
> +}
> +
> +static void update_metadata(AVFrame *in, double peak) {
> +    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
> +
> +    if (sd) {
> +        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;
> +        clm->MaxCLL = (unsigned)(peak * REFERENCE_WHITE);
> +    }
> +
> +    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
> +    if (sd) {
> +        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;
> +        if (metadata->has_luminance)
> +            metadata->max_luminance =av_d2q(peak * REFERENCE_WHITE, 10000);
> +    }
> +}
> +
> +static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    cl_int cle;
> +    int err;
> +    double peak = ctx->peak;
> +
> +    AVHWFramesContext *input_frames_ctx =
> +        (AVHWFramesContext*)input->hw_frames_ctx->data;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!peak)
> +        peak = determine_signal_peak(input);
> +
> +    if (ctx->trc != -1)
> +        output->color_trc = ctx->trc;
> +    if (ctx->primaries != -1)
> +        output->color_primaries = ctx->primaries;
> +    if (ctx->colorspace != -1)
> +        output->colorspace = ctx->colorspace;
> +    if (ctx->range != -1)
> +        output->color_range = ctx->range;
> +
> +    ctx->trc_in = input->color_trc;
> +    ctx->trc_out = output->color_trc;
> +    ctx->colorspace_in = input->colorspace;
> +    ctx->colorspace_out = output->colorspace;
> +    ctx->primaries_in = input->color_primaries;
> +    ctx->primaries_out = output->color_primaries;
> +    ctx->range_in = input->color_range;
> +    ctx->range_out = output->color_range;
> +
> +    if (!ctx->initialised) {
> +        if (!(input->color_trc == AVCOL_TRC_SMPTE2084 ||
> +            input->color_trc == AVCOL_TRC_ARIB_STD_B67)) {
> +            av_log(ctx, AV_LOG_ERROR, "unsupported transfer function characteristic.\n");
> +            err = AVERROR(ENOSYS);
> +            goto fail;
> +        }
> +
> +        if (input_frames_ctx->sw_format != AV_PIX_FMT_P010) {
> +            av_log(ctx, AV_LOG_ERROR, "unsupported format in tonemap_opencl.\n");
> +            err = AVERROR(ENOSYS);
> +            goto fail;
> +        }
> +
> +        err = tonemap_opencl_init(avctx);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    switch(input_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_P010:
> +        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
> +        if (err < 0) goto fail;
> +        break;
> +    default:
> +        err = AVERROR(ENOSYS);
> +        goto fail;
> +    }
> +
> +    cle = clFinish(ctx->command_queue);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",
> +               cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    av_frame_free(&input);
> +
> +    update_metadata(output, ctx->target_peak);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Tone-mapping output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +#ifdef DEBUG

I like the inclusion of this code, but I don't think it's a good idea to have blocks which are never built without source editing.  Put it behind an option instead?

> +    {
> +        uint32_t *ptr, *max_total_p, *avg_total_p, *frame_number_p;
> +        float peak_detected, avg_detected;
> +
> +        unsigned map_size = (2 * DETECTION_FRAMES  + 7) * sizeof(unsigned);
> +        ptr = (void *)clEnqueueMapBuffer(ctx->command_queue, ctx->util_mem,
> +                                         CL_TRUE, CL_MAP_READ, 0, map_size,
> +                                         0, NULL, NULL, &cle);
> +        // For the layout of the util buffer, refer tonemap.cl
> +        if (ptr) {
> +            max_total_p = ptr + 2 * (DETECTION_FRAMES + 1) + 1;
> +            avg_total_p = max_total_p + 1;
> +            frame_number_p = avg_total_p + 2;
> +            peak_detected = (float)*max_total_p / (REFERENCE_WHITE * (*frame_number_p));
> +            avg_detected = (float)*avg_total_p / (REFERENCE_WHITE * (*frame_number_p));
> +            av_log(ctx, AV_LOG_DEBUG, "peak %f, avg %f will be used for next frame\n",
> +                   peak_detected, avg_detected);
> +            clEnqueueUnmapMemObject(ctx->command_queue, ctx->util_mem, ptr, 0,
> +                                    NULL, NULL);
> +        }
> +    }
> +#endif
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->kernel) {
> +        cle = clReleaseKernel(ctx->kernel);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "kernel: %d.\n", cle);
> +    }
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}9
> +
> +#define OFFSET(x) offsetof(TonemapOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption tonemap_opencl_options[] = {
> +    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_NONE}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" },
> +    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, "tonemap" },
> +    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, "tonemap" },
> +    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, "tonemap" },
> +    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, "tonemap" },
> +    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, "tonemap" },
> +    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, "tonemap" },
> +    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, "tonemap" },
> +    { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" },
> +    { "t",        "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},         0, 0, FLAGS, "transfer" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},     0, 0, FLAGS, "transfer" },
> +    { "matrix", "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    { "m",      "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},         0, 0, FLAGS, "matrix" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},    0, 0, FLAGS, "matrix" },
> +    { "primaries", "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    { "p",         "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},         0, 0, FLAGS, "primaries" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020},        0, 0, FLAGS, "primaries" },
> +    { "range",         "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    { "r",             "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    {     "tv",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "pc",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    {     "limited",       0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "full",          0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    { "format",    "output pixel format", OFFSET(format), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_GBRAP12LE, FLAGS, "fmt" },

Just make the top limit a large number to avoid putting something weird here (e.g. libavcodec/options_table.h uses INT_MAX for this purpose).

> +    { "peak",      "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
> +    { "param",     "tonemap parameter",   OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },

You need a configure dependency on const_nan to use NAN.

> +    { "desat",     "desaturation parameter",   OFFSET(desat_param), AV_OPT_TYPE_DOUBLE, {.dbl = 0.5}, 0, DBL_MAX, FLAGS },
> +    { "threshold", "scene detection threshold",   OFFSET(scene_threshold), AV_OPT_TYPE_DOUBLE, {.dbl = 0.2}, 0, DBL_MAX, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(tonemap_opencl);
> +
> +static const AVFilterPad tonemap_opencl_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = &tonemap_opencl_filter_frame,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad tonemap_opencl_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &tonemap_opencl_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_tonemap_opencl = {
> +    .name           = "tonemap_opencl",
> +    .description    = NULL_IF_CONFIG_SMALL("perform HDR to SDR conversion with tonemapping"),
> +    .priv_size      = sizeof(TonemapOpenCLContext),
> +    .priv_class     = &tonemap_opencl_class,
> +    .init           = &ff_opencl_filter_init,
> +    .uninit         = &tonemap_opencl_uninit,
> +    .query_formats  = &ff_opencl_filter_query_formats,
> +    .inputs         = tonemap_opencl_inputs,
> +    .outputs        = tonemap_opencl_outputs,
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> 

Thanks,

- Mark