[FFmpeg-devel] [PATCH V5 2/2] lavfi/lut: Add slice threading support

Sun Jun 2 11:13:28 EEST 2019

On 6/1/19, Jun Zhao <mypopydev at gmail.com> wrote:
> From: Jun Zhao <barryjzhao at tencent.com>
>
> Used the following commands on a 1080p H.264 clip:
>
> a). ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null
> b). ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null
>
> After enabling slice threading, the fps changed from:
>
> a). 144fps to 258fps (lutyuv)
> b). 94fps  to 153fps (lutrgb)
>
> on an Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz
>
> Signed-off-by: Jun Zhao <barryjzhao at tencent.com>
> ---
>  libavfilter/vf_lut.c |  310
> ++++++++++++++++++++++++++++++++------------------
>  1 files changed, 197 insertions(+), 113 deletions(-)
>
> diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
> index c815ddc..90998e6 100644
> --- a/libavfilter/vf_lut.c
> +++ b/libavfilter/vf_lut.c
> @@ -337,13 +337,194 @@ static int config_props(AVFilterLink *inlink)
>      return 0;
>  }
>
> +struct thread_data {
> +    AVFrame *in;
> +    AVFrame *out;
> +
> +    int w;
> +    int h;
> +};
> +
> +#define LOAD_PACKED_COMMON\
> +    LutContext *s = ctx->priv;\
> +    const struct thread_data *td = arg;\
> +\
> +    int i, j;\
> +    const int w = td->w;\
> +    const int h = td->h;\
> +    AVFrame *in = td->in;\
> +    AVFrame *out = td->out;\
> +    const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;\
> +    const int step = s->step;\
> +\
> +    const int slice_start = (h *  jobnr   ) / nb_jobs;\
> +    const int slice_end   = (h * (jobnr+1)) / nb_jobs;\
> +
> +/* packed, 16-bit */
> +static int lut_packed_16bits(AVFilterContext *ctx, void *arg, int jobnr,
> int nb_jobs)
> +{
> +    LOAD_PACKED_COMMON
> +
> +    uint16_t *inrow, *outrow, *inrow0, *outrow0;
> +    const int in_linesize  =  in->linesize[0] / 2;
> +    const int out_linesize = out->linesize[0] / 2;
> +    inrow0  = (uint16_t *)in ->data[0];
> +    outrow0 = (uint16_t *)out->data[0];
> +
> +    for (i = slice_start; i < slice_end; i++) {
> +        inrow  = inrow0 + i * in_linesize;
> +        outrow = outrow0 + i * out_linesize;
> +        for (j = 0; j < w; j++) {
> +
> +            switch (step) {
> +#if HAVE_BIGENDIAN
> +            case 4:  outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]);
> // Fall-through
> +            case 3:  outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]);
> // Fall-through
> +            case 2:  outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]);
> // Fall-through
> +            default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
> +#else
> +            case 4:  outrow[3] = tab[3][inrow[3]]; // Fall-through
> +            case 3:  outrow[2] = tab[2][inrow[2]]; // Fall-through
> +            case 2:  outrow[1] = tab[1][inrow[1]]; // Fall-through
> +            default: outrow[0] = tab[0][inrow[0]];
> +#endif
> +            }
> +            outrow += step;
> +            inrow  += step;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/* packed, 8-bit */
> +static int lut_packed_8bits(AVFilterContext *ctx, void *arg, int jobnr, int
> nb_jobs)
> +{
> +    LOAD_PACKED_COMMON
> +
> +    uint8_t *inrow, *outrow, *inrow0, *outrow0;
> +    const int in_linesize  =  in->linesize[0];
> +    const int out_linesize = out->linesize[0];
> +    inrow0  = in ->data[0];
> +    outrow0 = out->data[0];
> +
> +    for (i = slice_start; i < slice_end; i++) {
> +        inrow  = inrow0 + i * in_linesize;
> +        outrow = outrow0 + i * out_linesize;
> +        for (j = 0; j < w; j++) {
> +            switch (step) {
> +            case 4:  outrow[3] = tab[3][inrow[3]]; // Fall-through
> +            case 3:  outrow[2] = tab[2][inrow[2]]; // Fall-through
> +            case 2:  outrow[1] = tab[1][inrow[1]]; // Fall-through
> +            default: outrow[0] = tab[0][inrow[0]];
> +            }
> +            outrow += step;
> +            inrow  += step;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +#define LOAD_PLANAR_COMMON\
> +    LutContext *s = ctx->priv;\
> +    const struct thread_data *td = arg;\
> +    int i, j, plane;\
> +    AVFrame *in = td->in;\
> +    AVFrame *out = td->out;\
> +
> +#define PLANAR_COMMON\
> +        int vsub = plane == 1 || plane == 2 ? s->vsub : 0;\
> +        int hsub = plane == 1 || plane == 2 ? s->hsub : 0;\
> +        int h = AV_CEIL_RSHIFT(td->h, vsub);\
> +        int w = AV_CEIL_RSHIFT(td->w, hsub);\
> +        const uint16_t *tab = s->lut[plane];\
> +\
> +        const int slice_start = (h *  jobnr   ) / nb_jobs;\
> +        const int slice_end   = (h * (jobnr+1)) / nb_jobs;\
> +
> +/* planar >8 bit depth */
> +static int lut_planar_16bits(AVFilterContext *ctx, void *arg, int jobnr,
> int nb_jobs)
> +{
> +    LOAD_PLANAR_COMMON
> +
> +    uint16_t *inrow, *outrow;
> +
> +    for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane];
> plane++) {
> +        PLANAR_COMMON
> +
> +        const int in_linesize  =  in->linesize[plane] / 2;
> +        const int out_linesize = out->linesize[plane] / 2;
> +
> +        inrow  = (uint16_t *)(in ->data[plane] + slice_start *
> in_linesize);
> +        outrow = (uint16_t *)(out->data[plane] + slice_start *
> out_linesize);
> +
> +        for (i = slice_start; i < slice_end; i++) {
> +            for (j = 0; j < w; j++) {
> +#if HAVE_BIGENDIAN
> +                outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
> +#else
> +                outrow[j] = tab[inrow[j]];
> +#endif
> +            }
> +            inrow  += in_linesize;
> +            outrow += out_linesize;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/* planar 8bit depth */
> +static int lut_planar_8bits(AVFilterContext *ctx, void *arg, int jobnr, int
> nb_jobs)
> +{
> +    LOAD_PLANAR_COMMON
> +
> +    uint8_t *inrow, *outrow;
> +
> +    for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane];
> plane++) {
> +        PLANAR_COMMON
> +
> +        const int in_linesize  =  in->linesize[plane];
> +        const int out_linesize = out->linesize[plane];
> +
> +        inrow  = in ->data[plane] + slice_start * in_linesize;
> +        outrow = out->data[plane] + slice_start * out_linesize;
> +
> +        for (i = slice_start; i < slice_end; i++) {
> +            for (j = 0; j < w; j++)
> +                outrow[j] = tab[inrow[j]];
> +            inrow  += in_linesize;
> +            outrow += out_linesize;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +#define PACKED_THREAD_DATA\
> + struct thread_data td = {\
> +            .in  = in,\
> +            .out = out,\
> +            .w   = inlink->w,\
> +            .h   = in->height,\
> +        };\
> +
> +#define PLANAR_THREAD_DATA\
> + struct thread_data td = {\
> +            .in  = in,\
> +            .out = out,\
> +            .w   = inlink->w,\
> +            .h   = inlink->h,\
> +        };\
> +
>  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>  {
>      AVFilterContext *ctx = inlink->dst;
>      LutContext *s = ctx->priv;
>      AVFilterLink *outlink = ctx->outputs[0];
>      AVFrame *out;
> -    int i, j, plane, direct = 0;
> +    int direct = 0;
>
>      if (av_frame_is_writable(in)) {
>          direct = 1;
> @@ -359,121 +540,24 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
> *in)
>
>      if (s->is_rgb && s->is_16bit && !s->is_planar) {
>          /* packed, 16-bit */
> -        uint16_t *inrow, *outrow, *inrow0, *outrow0;
> -        const int w = inlink->w;
> -        const int h = in->height;
> -        const uint16_t (*tab)[256*256] = (const uint16_t
> (*)[256*256])s->lut;
> -        const int in_linesize  =  in->linesize[0] / 2;
> -        const int out_linesize = out->linesize[0] / 2;
> -        const int step = s->step;
> -
> -        inrow0  = (uint16_t*) in ->data[0];
> -        outrow0 = (uint16_t*) out->data[0];
> -
> -        for (i = 0; i < h; i ++) {
> -            inrow  = inrow0;
> -            outrow = outrow0;
> -            for (j = 0; j < w; j++) {
> -
> -                switch (step) {
> -#if HAVE_BIGENDIAN
> -                case 4:  outrow[3] =
> av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
> -                case 3:  outrow[2] =
> av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
> -                case 2:  outrow[1] =
> av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
> -                default: outrow[0] =
> av_bswap16(tab[0][av_bswap16(inrow[0])]);
> -#else
> -                case 4:  outrow[3] = tab[3][inrow[3]]; // Fall-through
> -                case 3:  outrow[2] = tab[2][inrow[2]]; // Fall-through
> -                case 2:  outrow[1] = tab[1][inrow[1]]; // Fall-through
> -                default: outrow[0] = tab[0][inrow[0]];
> -#endif
> -                }
> -                outrow += step;
> -                inrow  += step;
> -            }
> -            inrow0  += in_linesize;
> -            outrow0 += out_linesize;
> -        }
> +        PACKED_THREAD_DATA
> +        ctx->internal->execute(ctx, lut_packed_16bits, &td, NULL,
> +                               FFMIN(in->height,
> ff_filter_get_nb_threads(ctx)));
>      } else if (s->is_rgb && !s->is_planar) {
> -        /* packed */
> -        uint8_t *inrow, *outrow, *inrow0, *outrow0;
> -        const int w = inlink->w;
> -        const int h = in->height;
> -        const uint16_t (*tab)[256*256] = (const uint16_t
> (*)[256*256])s->lut;
> -        const int in_linesize  =  in->linesize[0];
> -        const int out_linesize = out->linesize[0];
> -        const int step = s->step;
> -
> -        inrow0  = in ->data[0];
> -        outrow0 = out->data[0];
> -
> -        for (i = 0; i < h; i ++) {
> -            inrow  = inrow0;
> -            outrow = outrow0;
> -            for (j = 0; j < w; j++) {
> -                switch (step) {
> -                case 4:  outrow[3] = tab[3][inrow[3]]; // Fall-through
> -                case 3:  outrow[2] = tab[2][inrow[2]]; // Fall-through
> -                case 2:  outrow[1] = tab[1][inrow[1]]; // Fall-through
> -                default: outrow[0] = tab[0][inrow[0]];
> -                }
> -                outrow += step;
> -                inrow  += step;
> -            }
> -            inrow0  += in_linesize;
> -            outrow0 += out_linesize;
> -        }
> +        /* packed 8 bits */
> +        PACKED_THREAD_DATA
> +        ctx->internal->execute(ctx, lut_packed_8bits, &td, NULL,
> +                               FFMIN(in->height,
> ff_filter_get_nb_threads(ctx)));
>      } else if (s->is_16bit) {
> -        // planar >8 bit depth
> -        uint16_t *inrow, *outrow;
> -
> -        for (plane = 0; plane < 4 && in->data[plane] &&
> in->linesize[plane]; plane++) {
> -            int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
> -            int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
> -            int h = AV_CEIL_RSHIFT(inlink->h, vsub);
> -            int w = AV_CEIL_RSHIFT(inlink->w, hsub);
> -            const uint16_t *tab = s->lut[plane];
> -            const int in_linesize  =  in->linesize[plane] / 2;
> -            const int out_linesize = out->linesize[plane] / 2;
> -
> -            inrow  = (uint16_t *)in ->data[plane];
> -            outrow = (uint16_t *)out->data[plane];
> -
> -            for (i = 0; i < h; i++) {
> -                for (j = 0; j < w; j++) {
> -#if HAVE_BIGENDIAN
> -                    outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
> -#else
> -                    outrow[j] = tab[inrow[j]];
> -#endif
> -                }
> -                inrow  += in_linesize;
> -                outrow += out_linesize;
> -            }
> -        }
> +        /* planar >8 bit depth */
> +        PLANAR_THREAD_DATA
> +        ctx->internal->execute(ctx, lut_planar_16bits, &td, NULL,
> +                               FFMIN(in->height,
> ff_filter_get_nb_threads(ctx)));
>      } else {
>          /* planar 8bit depth */
> -        uint8_t *inrow, *outrow;
> -
> -        for (plane = 0; plane < 4 && in->data[plane] &&
> in->linesize[plane]; plane++) {
> -            int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
> -            int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
> -            int h = AV_CEIL_RSHIFT(inlink->h, vsub);
> -            int w = AV_CEIL_RSHIFT(inlink->w, hsub);
> -            const uint16_t *tab = s->lut[plane];
> -            const int in_linesize  =  in->linesize[plane];
> -            const int out_linesize = out->linesize[plane];
> -
> -            inrow  = in ->data[plane];
> -            outrow = out->data[plane];
> -
> -            for (i = 0; i < h; i++) {
> -                for (j = 0; j < w; j++)
> -                    outrow[j] = tab[inrow[j]];
> -                inrow  += in_linesize;
> -                outrow += out_linesize;
> -            }
> -        }
> +        PLANAR_THREAD_DATA
> +        ctx->internal->execute(ctx, lut_planar_8bits, &td, NULL,
> +                               FFMIN(in->height,
> ff_filter_get_nb_threads(ctx)));
>      }
>
>      if (!direct)
> @@ -508,7 +592,7 @@ static const AVFilterPad outputs[] = {
>          .query_formats = query_formats,                                 \
>          .inputs        = inputs,                                        \
>          .outputs       = outputs,                                       \
> -        .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,        \
> +        .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC |
> AVFILTER_FLAG_SLICE_THREADS,        \
>      }
>
>  #if CONFIG_LUT_FILTER
> --
> 1.7.1
>

Should be OK if the MD5 hash of the output does not change.

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".