[FFmpeg-devel] GSoC Project Submission: SiTi Filter Update for ITU-T P.910 Compliance and HDR Support

Thu Aug 22 15:46:37 EEST 2024

On 2024-08-21 09:56, Poorva wrote:
> Hello FFmpeg Team,
> 
> I am submitting the completed work from my Google Summer of Code project,
> which focuses on updating the SiTi filter for ITU-T P.910 07/22 compliance
> and adding HDR support.
> 
> Attached is the patch for your review. I appreciate your feedback and
> suggestions.

> diff --git a/libavfilter/vf_siti.c b/libavfilter/vf_siti.c
> index 722e7cecc7..2bbcaa0afd 100644
> --- a/libavfilter/vf_siti.c
> +++ b/libavfilter/vf_siti.c
> @@ -47,6 +47,19 @@ static const int Y_FILTER[9] = {
>      -1, -2, -1
>  };
>  
> +static const float PU21_MATRICES[4][7] = {

In general, variable names are lowercase. Names with all caps will be 
used for #defines. I know the variables above are all caps, but we 
usually avoid that.

> +    // Reference: "PU21: A novel perceptually uniform encoding for adapting existing quality metrics for HDR"
> +    // Rafał K. Mantiuk and M. Azimi, Picture Coding Symposium 2021
> +    // https://github.com/gfxdisp/pu21
> +    {1.070275272f, 0.4088273932f, 0.153224308f, 0.2520326168f, 1.063512885f, 1.14115047f, 521.4527484f},  // BANDING
> +    {0.353487901f, 0.3734658629f, 8.277049286e-05f, 0.9062562627f, 0.09150303166f, 0.9099517204f, 596.3148142f},  // BANDING_GLARE
> +    {1.043882782f, 0.6459495343f, 0.3194584211f, 0.374025247f, 1.114783422f, 1.095360363f, 384.9217577f},  // PEAKS
> +    {816.885024f, 1479.463946f, 0.001253215609f, 0.9329636822f, 0.06746643971f, 1.573435413f, 419.6006374f}  // PEAKS_GLARE
> +};
> +
> +static const float PU21_MIN_VALUES[4] = {-1.5580e-07f, 5.4705e-10f, 1.3674e-07f, -9.7360e-08f};
> +static const float PU21_MAX_VALUES[4] = {520.4673f, 595.3939f, 380.9853f, 407.5066f};
> +
>  typedef struct SiTiContext {
>      const AVClass *class;
>      int pixel_depth;
> @@ -63,8 +76,45 @@ typedef struct SiTiContext {
>      float *motion_matrix;
>      int full_range;
>      int print_summary;
> +    int hdr_mode;
> +    int bit_depth;
> +    int color_range;
> +    int eotf_function;
> +    int calculation_domain;
> +    float l_max;
> +    float l_min;
> +    float gamma;
> +    int pu21_mode;

This following fields are not being used: bit_depth, color_range, l_min, 
and pu21_mode. Perhaps you meant to use them (they seem to make sense in 
a couple of places in the rest of the patch). If they're not used, they 
should be removed (from the context and from the options).

>  } SiTiContext;
>  
> +enum HdrMode {
> +    SDR,
> +    HDR10,
> +    HLG
> +};
> +
> +enum ColorRange {
> +    LIMITED,
> +    FULL
> +};

We already have AVColorRange. Perhaps that could be used instead.

> +
> +enum EotfFunction {
> +    BT1886,
> +    INV_SRGB
> +};
> +
> +enum CalculationDomain {
> +    PQ,
> +    PU21
> +};
> +
> +enum Pu21Mode {
> +    BANDING,
> +    BANDING_GLARE,
> +    PEAKS,
> +    PEAKS_GLARE
> +};
> +
>  static const enum AVPixelFormat pix_fmts[] = {
>      AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
>      AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P,
> @@ -78,6 +128,15 @@ static av_cold int init(AVFilterContext *ctx)
>      SiTiContext *s = ctx->priv;
>      s->max_si = 0;
>      s->max_ti = 0;
> +    s->hdr_mode = SDR;
> +    s->bit_depth = 8;
> +    s->color_range = LIMITED;
> +    s->eotf_function = BT1886;
> +    s->calculation_domain = PQ;
> +    s->l_max = 300.0;
> +    s->l_min = 0.1;
> +    s->gamma = 2.4;
> +    s->pu21_mode = BANDING;

Like I mentioned above, some of these fields are not being used (except 
for being set here).

>      return 0;
>  }
>  
> @@ -166,6 +225,105 @@ static uint16_t convert_full_range(int factor, uint16_t y)
>      return (full_upper * limit_y / limit_upper);
>  }
>  
> +// EOTF for BT.1886
> +static float eotf_1886(float x, float gamma, float l_min, float l_max)
> +{
> +    // Reference: ITU-R BT.1886, Annex 1
> +    // https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.1886-0-201103-I!!PDF-E.pdf
> +    const float l_max_gamma = powf(l_max, 1.0f / gamma);
> +    const float l_min_gamma = powf(l_min, 1.0f / gamma);
> +    const float l_diff_gamma = l_max_gamma - l_min_gamma;
> +    const float a = powf(l_diff_gamma, gamma);
> +    const float b = l_min_gamma / l_diff_gamma;
> +    const float adjusted_x = fmaxf(x + b, 0.0f);
> +
> +    return a * powf(adjusted_x, gamma);
> +}
> +
> +static float eotf_inv_srgb(float x)
> +{
> +    // Inverse sRGB EOTF (Electro-Optical Transfer Function) according to IEC 61966-2-1:1999
> +    // Section G.2 (Encoding transformation)
> +    // Reference: https://cdn.standards.iteh.ai/samples/10795/ae461684569b40bbbb2d9a22b1047f05/IEC-61966-2-1-1999-AMD1-2003.pdf
> +    return (x <= 0.04045f) ? x / 12.92f : powf((x + 0.055f) / 1.055f, 2.4f);
> +}
> +
> +static float apply_display_model(SiTiContext *s, float x)
> +{
> +    // Apply either BT.1886 or inverse sRGB EOTF based on the context
> +    if (s->eotf_function == BT1886) {
> +        // BT.1886 EOTF
> +        // Reference: ITU-R BT.1886-0, Annex 1
> +        // https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.1886-0-201103-I!!PDF-E.pdf
> +        return eotf_1886(x, s->gamma, 0.0f, 1.0f);
> +    } else {
> +        // Inverse sRGB EOTF
> +        return eotf_inv_srgb(x);
> +    }
> +}
> +
> +static float oetf_pq(float x)
> +{
> +    // PQ (Perceptual Quantizer) OETF according to ITU-R BT.2100-2
> +    // Reference: https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2100-2-201807-I!!PDF-E.pdf
> +    // See page 5, Table 4
> +
> +    const float m1 = 0.1593017578125f;
> +    const float m2 = 78.84375f;
> +    const float c1 = 0.8359375f;
> +    const float c2 = 18.8515625f;
> +    const float c3 = 18.6875f;
> +
> +    float y = powf(x / 10000.0f, m1);
> +    return powf((c1 + c2 * y) / (1.0f + c3 * y), m2);
> +}
> +
> +static float oetf_pu21(float x, int mode);

This forward declaration can be avoided if you move oetf_pu21_wrapper 
below oetf_pu21.

> +
> +static float oetf_pu21_wrapper(float x) {

Nit: place curly brackets for function scopes on their own lines.

> +    int mode = 0; // Set the appropriate mode or pass it as a parameter
> +    return oetf_pu21(x, mode);

Why are you hardcoding mode to 0?

> +}
> +
> +static float oetf_pu21(float x, int mode)
> +{
> +    // Reference: "PU21: A novel perceptually uniform encoding for adapting existing quality metrics for HDR"
> +    // Rafał K. Mantiuk and M. Azimi, Picture Coding Symposium 2021
> +    // https://github.com/gfxdisp/pu21
> +    // Validate the mode
> +    const float *p;
> +    float p_min;
> +    float p_max;
> +    float numerator;
> +    float denominator;
> +    float base;
> +    float exponentiated;
> +    float scaled;
> +
> +    if (mode < 0 || mode > 3) {
> +        return x;  // Invalid mode, return input unchanged
> +    }

It would be best to check the mode while the option is being set and 
bail out in the initialization if the mode is invalid, instead of 
silently misbehaving when this function is called.

> +
> +    // Get the parameters and min/max values for the given mode
> +
> +
> +    p = PU21_MATRICES[mode];
> +    p_min = PU21_MIN_VALUES[mode];
> +    p_max = PU21_MAX_VALUES[mode];
> +
> +    // Declare all other variables at the beginning of the block
> +    numerator = p[0] + p[1] * powf(x, p[3]);
> +    denominator = 1.0f + p[2] * powf(x, p[3]);
> +    base = numerator / denominator;
> +
> +    // Calculate the exponentiated and scaled value
> +    exponentiated = powf(base, p[4]);
> +    scaled = p[6] * (exponentiated - p[5]);
> +
> +    // Normalize the result to the range [0, 1]
> +    return (scaled - p_min) / (p_max - p_min);
> +}
> +
>  // Applies sobel convolution
>  static void convolve_sobel(SiTiContext *s, const uint8_t *src, float *dst, int linesize)
>  {
> @@ -186,20 +344,20 @@ static void convolve_sobel(SiTiContext *s, const uint8_t *src, float *dst, int l
>      #define CONVOLVE(bps)                                           \
>      {                                                               \
>          uint##bps##_t *vsrc = (uint##bps##_t*)src;                  \
> -        for (int j = 1; j < s->height - 1; j++) {                   \
> -            for (int i = 1; i < s->width - 1; i++) {                \
> +        for (int img_j = 1; img_j < s->height - 1; img_j++) {                   \
> +            for (int img_i = 1; img_i < s->width - 1; img_i++) {                \
>                  x_conv_sum = 0.0;                                   \
>                  y_conv_sum = 0.0;                                   \
>                  for (int k = 0; k < filter_size; k++) {             \
>                      ki = k % filter_width - 1;                      \
>                      kj = floor(k / filter_width) - 1;               \
> -                    index = (j + kj) * stride + (i + ki);           \
> +                    index = (img_j + kj) * stride + (img_i + ki);           \
>                      data = s->full_range ? vsrc[index] : convert_full_range(factor, vsrc[index]); \
>                      x_conv_sum += data * X_FILTER[k];               \
>                      y_conv_sum += data * Y_FILTER[k];               \
>                  }                                                   \
>                  gradient = sqrt(x_conv_sum * x_conv_sum + y_conv_sum * y_conv_sum); \
> -                dst[(j - 1) * (s->width - 2) + (i - 1)] = gradient; \
> +                dst[(img_j - 1) * (s->width - 2) + (img_i - 1)] = gradient; \
>              }                                                       \
>          }                                                           \
>      }

This chunk just renames some variables, which doesn't seem necessary. It 
also seems unrelated to the main patch. When renaming like this is 
indeed necessary, it's a cosmetic change and it should be in a separate 
patch.

Also, pay attention to keeping the alignment of the backslashes at the 
end of the line when changing code inside macros.

> @@ -254,15 +412,16 @@ static float std_deviation(float *img_metrics, int width, int height)
>      int size = height * width;
>      double mean = 0.0;
>      double sqr_diff = 0;
> +    int j, i;
>  
> -    for (int j = 0; j < height; j++)
> -        for (int i = 0; i < width; i++)
> +    for (j = 0; j < height; j++)
> +        for (i = 0; i < width; i++)
>              mean += img_metrics[j * width + i];
>  
>      mean /= size;
>  
> -    for (int j = 0; j < height; j++) {
> -        for (int i = 0; i < width; i++) {
> +    for (j = 0; j < height; j++) {
> +        for (i = 0; i < width; i++) {
>              float mean_diff = img_metrics[j * width + i] - mean;
>              sqr_diff += (mean_diff * mean_diff);
>          }

Again, this chunk is just cosmetics. If it's unnecessary, it should be 
dropped. If it's unrelated, it should be split into its own patch.

> @@ -285,15 +444,46 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
>      float si;
>      float ti;
>  
> -    s->full_range = is_full_range(frame);
> +    s->full_range = is_full_range(frame); // Determine if full range is needed
>      s->nb_frames++;
>  
> +    // Apply EOTF and OETF transformations
> +    for (int idx = 0; idx < s->width * s->height; idx++) {

Nit++: 'i' is shorter than 'idx' :)

> +        float pixel = ((uint8_t*)frame->data[0])[idx] / 255.0f;
> +        float eotf = apply_display_model(s, pixel);
> +        float oetf = oetf_pq(eotf * s->l_max);
> +        ((uint8_t*)frame->data[0])[idx] = (uint8_t)(oetf * 255.0f);
> +    }
> +
>      // Calculate si and ti
>      convolve_sobel(s, frame->data[0], s->gradient_matrix, frame->linesize[0]);
>      calculate_motion(s, frame->data[0], s->motion_matrix, frame->linesize[0]);
>      si = std_deviation(s->gradient_matrix, s->width - 2, s->height - 2);
>      ti = std_deviation(s->motion_matrix, s->width, s->height);
>  
> +    // Apply HDR transformations if necessary
> +    if (s->hdr_mode != SDR) {
> +        float (*oetf_func)(float);
> +        if (s->calculation_domain == PQ) {
> +            oetf_func = oetf_pq;
> +        } else {
> +            oetf_func = oetf_pu21_wrapper;
> +        }
> +
> +        for (int i = 0; i < (s->width - 2) * (s->height - 2); i++) {
> +            s->gradient_matrix[i] = oetf_func(apply_display_model(s, s->gradient_matrix[i] / 255.0f)) * 255.0f;
> +        }
> +        for (int i = 0; i < s->width * s->height; i++) {
> +            s->motion_matrix[i] = oetf_func(apply_display_model(s, s->motion_matrix[i] / 255.0f)) * 255.0f;
> +        }
> +
> +        si = std_deviation(s->gradient_matrix, s->width - 2, s->height - 2);
> +        ti = std_deviation(s->motion_matrix, s->width, s->height);
> +    }
> +
> +    // Print SI and TI values for each frame
> +    av_log(ctx, AV_LOG_INFO, "Frame %"PRId64" - SI: %.6f, TI: %.6f\n", s->nb_frames, si, ti);

Is this behaviour expected? To always print the SI and TI values for 
each frame? Perhaps it should be an option (similar to "print_summary").

> +
>      // Calculate statistics
>      s->max_si  = fmaxf(si, s->max_si);
>      s->max_ti  = fmaxf(ti, s->max_ti);
> @@ -314,6 +504,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
>  
>  static const AVOption siti_options[] = {
>      { "print_summary", "Print summary showing average values", OFFSET(print_summary), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS },
> +    { "hdr_mode", "HDR mode (0: SDR, 1: HDR10, 2: HLG)", OFFSET(hdr_mode), AV_OPT_TYPE_INT, {.i64=0}, 0, 2, FLAGS },
> +    { "bit_depth", "Bit depth (8, 10, or 12)", OFFSET(bit_depth), AV_OPT_TYPE_INT, {.i64=8}, 8, 12, FLAGS },
> +    { "color_range", "Color range (0: limited, 1: full)", OFFSET(color_range), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },
> +    { "eotf_function", "EOTF function (0: BT.1886, 1: Inverse sRGB)", OFFSET(eotf_function), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },
> +    { "calculation_domain", "Calculation domain (0: PQ, 1: PU21)", OFFSET(calculation_domain), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },

Instead of writing the possible values as numbers in the help string, 
declare a list of constants that are accepted. Look at the "dct" option 
and its AV_OPT_TYPE_CONST entries in libavcodec/options_table.h for an 
example.

The same could be done for the other options you added.

> +    { "l_max", "Maximum luminance", OFFSET(l_max), AV_OPT_TYPE_FLOAT, {.dbl=300.0}, 0, 10000, FLAGS },
> +    { "l_min", "Minimum luminance", OFFSET(l_min), AV_OPT_TYPE_FLOAT, {.dbl=0.1}, 0, 1, FLAGS },
> +    { "gamma", "Gamma value for BT.1886", OFFSET(gamma), AV_OPT_TYPE_FLOAT, {.dbl=2.4}, 1, 3, FLAGS },
> +    { "pu21_mode", "PU21 mode (0: BANDING, 1: BANDING_GLARE, 2: PEAKS, 3: PEAKS_GLARE)", OFFSET(pu21_mode), AV_OPT_TYPE_INT, {.i64=0}, 0, 3, FLAGS },
>      { NULL }
>  };
>