[FFmpeg-cvslog] lavfi/lut: Add slice threading support
Jun Zhao
git at videolan.org
Mon Jun 3 04:43:32 EEST 2019
ffmpeg | branch: master | Jun Zhao <barryjzhao at tencent.com> | Tue May 21 23:39:09 2019 +0800| [bbad0bc5ffcfd37df2df5aec6541fa766323d6bf] | committer: Jun Zhao
lavfi/lut: Add slice threading support
Tested with the following commands on a 1080p H.264 clip:
a). ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null
b). ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null
After enabling slice threading, the fps changes from:
a). 144fps to 258fps (lutyuv)
b). 94fps to 153fps (lutrgb)
on an Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz
Reviewed-by: Paul B Mahol <onemda at gmail.com>
Signed-off-by: Jun Zhao <barryjzhao at tencent.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bbad0bc5ffcfd37df2df5aec6541fa766323d6bf
---
libavfilter/vf_lut.c | 310 ++++++++++++++++++++++++++++++++-------------------
1 file changed, 197 insertions(+), 113 deletions(-)
diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
index c815ddc194..90998e655f 100644
--- a/libavfilter/vf_lut.c
+++ b/libavfilter/vf_lut.c
@@ -337,13 +337,194 @@ static int config_props(AVFilterLink *inlink)
return 0;
}
+struct thread_data { /* per-frame job description passed as the arg pointer to the lut_* slice workers */
+ AVFrame *in; /* source frame read by the workers */
+ AVFrame *out; /* destination frame written by the workers */
+
+ int w; /* frame width; set from inlink->w by the *_THREAD_DATA macros */
+ int h; /* frame height; in->height for packed data, inlink->h for planar data */
+};
+
+#define LOAD_PACKED_COMMON /* shared prologue of the packed workers: unpacks thread_data and computes this job's row range; expects ctx, arg, jobnr, nb_jobs in scope */\
+ LutContext *s = ctx->priv;\
+ const struct thread_data *td = arg;\
+\
+ int i, j;\
+ const int w = td->w;\
+ const int h = td->h;\
+ AVFrame *in = td->in;\
+ AVFrame *out = td->out;\
+ const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut; /* view s->lut as rows of 256*256 entries, one row per component index */\
+ const int step = s->step;\
+\
+ const int slice_start = (h * jobnr ) / nb_jobs; /* first row handled by this job (inclusive) */\
+ const int slice_end = (h * (jobnr+1)) / nb_jobs; /* one past the last row handled by this job */\
+
+/* packed, 16-bit: slice worker that maps rows [slice_start, slice_end) of data[0] through the per-component tables; always returns 0 */
+static int lut_packed_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+ LOAD_PACKED_COMMON
+
+ uint16_t *inrow, *outrow, *inrow0, *outrow0;
+ const int in_linesize = in->linesize[0] / 2; /* stride in uint16_t elements rather than bytes */
+ const int out_linesize = out->linesize[0] / 2; /* stride in uint16_t elements rather than bytes */
+ inrow0 = (uint16_t *)in ->data[0];
+ outrow0 = (uint16_t *)out->data[0];
+
+ for (i = slice_start; i < slice_end; i++) {
+ inrow = inrow0 + i * in_linesize; /* row address derived from i, so each job is independent of the others */
+ outrow = outrow0 + i * out_linesize;
+ for (j = 0; j < w; j++) {
+
+ switch (step) { /* enter at the highest component index and fall through down to index 0 */
+#if HAVE_BIGENDIAN
+ case 4: outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
+ case 3: outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
+ case 2: outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
+ default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
+#else
+ case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
+ case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
+ case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
+ default: outrow[0] = tab[0][inrow[0]];
+#endif
+ }
+ outrow += step; /* advance both pointers by step elements per pixel */
+ inrow += step;
+ }
+ }
+
+ return 0;
+}
+
+/* packed, 8-bit: slice worker that maps rows [slice_start, slice_end) of data[0] through the per-component tables; always returns 0 */
+static int lut_packed_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+ LOAD_PACKED_COMMON
+
+ uint8_t *inrow, *outrow, *inrow0, *outrow0;
+ const int in_linesize = in->linesize[0]; /* byte stride, used directly with the uint8_t pointers */
+ const int out_linesize = out->linesize[0];
+ inrow0 = in ->data[0];
+ outrow0 = out->data[0];
+
+ for (i = slice_start; i < slice_end; i++) {
+ inrow = inrow0 + i * in_linesize; /* row address derived from i, so each job is independent of the others */
+ outrow = outrow0 + i * out_linesize;
+ for (j = 0; j < w; j++) {
+ switch (step) { /* enter at the highest component index and fall through down to index 0 */
+ case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
+ case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
+ case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
+ default: outrow[0] = tab[0][inrow[0]];
+ }
+ outrow += step; /* advance both pointers by step bytes per pixel */
+ inrow += step;
+ }
+ }
+
+ return 0;
+}
+
+#define LOAD_PLANAR_COMMON /* shared prologue of the planar workers: unpacks thread_data; expects ctx and arg in scope */\
+ LutContext *s = ctx->priv;\
+ const struct thread_data *td = arg;\
+ int i, j, plane;\
+ AVFrame *in = td->in;\
+ AVFrame *out = td->out;\
+
+#define PLANAR_COMMON /* per-plane setup: plane dimensions, lookup table and this job's row range; expects plane, s, td, jobnr, nb_jobs in scope */\
+ int vsub = plane == 1 || plane == 2 ? s->vsub : 0; /* only planes 1 and 2 use the vertical subsampling shift */\
+ int hsub = plane == 1 || plane == 2 ? s->hsub : 0; /* only planes 1 and 2 use the horizontal subsampling shift */\
+ int h = AV_CEIL_RSHIFT(td->h, vsub);\
+ int w = AV_CEIL_RSHIFT(td->w, hsub);\
+ const uint16_t *tab = s->lut[plane];\
+\
+ const int slice_start = (h * jobnr ) / nb_jobs; /* first plane row handled by this job (inclusive) */\
+ const int slice_end = (h * (jobnr+1)) / nb_jobs; /* one past the last plane row handled by this job */\
+
+/* planar >8 bit depth: slice worker; for each plane present, maps rows [slice_start, slice_end) through s->lut[plane]; always returns 0 */
+static int lut_planar_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+ LOAD_PLANAR_COMMON
+
+ uint16_t *inrow, *outrow;
+
+ for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
+ PLANAR_COMMON
+
+ const int in_linesize = in->linesize[plane] / 2; /* stride in uint16_t elements rather than bytes */
+ const int out_linesize = out->linesize[plane] / 2; /* stride in uint16_t elements rather than bytes */
+
+ inrow = (uint16_t *)in ->data[plane] + slice_start * in_linesize; /* cast before adding: the offset is counted in uint16_t elements, not bytes */
+ outrow = (uint16_t *)out->data[plane] + slice_start * out_linesize; /* same element-based offset for the destination */
+
+ for (i = slice_start; i < slice_end; i++) {
+ for (j = 0; j < w; j++) {
+#if HAVE_BIGENDIAN
+ outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
+#else
+ outrow[j] = tab[inrow[j]];
+#endif
+ }
+ inrow += in_linesize;
+ outrow += out_linesize;
+ }
+ }
+
+ return 0;
+}
+
+/* planar 8bit depth: slice worker; for each plane present, maps rows [slice_start, slice_end) through s->lut[plane]; always returns 0 */
+static int lut_planar_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+ LOAD_PLANAR_COMMON
+
+ uint8_t *inrow, *outrow;
+
+ for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) { /* stop at the first plane without data or linesize */
+ PLANAR_COMMON
+
+ const int in_linesize = in->linesize[plane]; /* byte stride, used directly with the uint8_t pointers */
+ const int out_linesize = out->linesize[plane];
+
+ inrow = in ->data[plane] + slice_start * in_linesize; /* start at this job's first row */
+ outrow = out->data[plane] + slice_start * out_linesize;
+
+ for (i = slice_start; i < slice_end; i++) {
+ for (j = 0; j < w; j++)
+ outrow[j] = tab[inrow[j]];
+ inrow += in_linesize;
+ outrow += out_linesize;
+ }
+ }
+
+ return 0;
+}
+
+#define PACKED_THREAD_DATA /* declares a local thread_data named td for the packed paths; expects in, out and inlink in scope */\
+ struct thread_data td = {\
+ .in = in,\
+ .out = out,\
+ .w = inlink->w,\
+ .h = in->height, /* packed paths take the height from the frame itself */\
+ };\
+
+#define PLANAR_THREAD_DATA /* declares a local thread_data named td for the planar paths; expects in, out and inlink in scope */\
+ struct thread_data td = {\
+ .in = in,\
+ .out = out,\
+ .w = inlink->w,\
+ .h = inlink->h, /* planar paths take the height from the input link */\
+ };\
+
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
AVFilterContext *ctx = inlink->dst;
LutContext *s = ctx->priv;
AVFilterLink *outlink = ctx->outputs[0];
AVFrame *out;
- int i, j, plane, direct = 0;
+ int direct = 0;
if (av_frame_is_writable(in)) {
direct = 1;
@@ -359,121 +540,24 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
if (s->is_rgb && s->is_16bit && !s->is_planar) {
/* packed, 16-bit */
- uint16_t *inrow, *outrow, *inrow0, *outrow0;
- const int w = inlink->w;
- const int h = in->height;
- const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
- const int in_linesize = in->linesize[0] / 2;
- const int out_linesize = out->linesize[0] / 2;
- const int step = s->step;
-
- inrow0 = (uint16_t*) in ->data[0];
- outrow0 = (uint16_t*) out->data[0];
-
- for (i = 0; i < h; i ++) {
- inrow = inrow0;
- outrow = outrow0;
- for (j = 0; j < w; j++) {
-
- switch (step) {
-#if HAVE_BIGENDIAN
- case 4: outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
- case 3: outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
- case 2: outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
- default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
-#else
- case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
- case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
- case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
- default: outrow[0] = tab[0][inrow[0]];
-#endif
- }
- outrow += step;
- inrow += step;
- }
- inrow0 += in_linesize;
- outrow0 += out_linesize;
- }
+ PACKED_THREAD_DATA
+ ctx->internal->execute(ctx, lut_packed_16bits, &td, NULL,
+ FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else if (s->is_rgb && !s->is_planar) {
- /* packed */
- uint8_t *inrow, *outrow, *inrow0, *outrow0;
- const int w = inlink->w;
- const int h = in->height;
- const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
- const int in_linesize = in->linesize[0];
- const int out_linesize = out->linesize[0];
- const int step = s->step;
-
- inrow0 = in ->data[0];
- outrow0 = out->data[0];
-
- for (i = 0; i < h; i ++) {
- inrow = inrow0;
- outrow = outrow0;
- for (j = 0; j < w; j++) {
- switch (step) {
- case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
- case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
- case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
- default: outrow[0] = tab[0][inrow[0]];
- }
- outrow += step;
- inrow += step;
- }
- inrow0 += in_linesize;
- outrow0 += out_linesize;
- }
+ /* packed 8 bits */
+ PACKED_THREAD_DATA
+ ctx->internal->execute(ctx, lut_packed_8bits, &td, NULL,
+ FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else if (s->is_16bit) {
- // planar >8 bit depth
- uint16_t *inrow, *outrow;
-
- for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
- int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
- int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
- int h = AV_CEIL_RSHIFT(inlink->h, vsub);
- int w = AV_CEIL_RSHIFT(inlink->w, hsub);
- const uint16_t *tab = s->lut[plane];
- const int in_linesize = in->linesize[plane] / 2;
- const int out_linesize = out->linesize[plane] / 2;
-
- inrow = (uint16_t *)in ->data[plane];
- outrow = (uint16_t *)out->data[plane];
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
-#if HAVE_BIGENDIAN
- outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
-#else
- outrow[j] = tab[inrow[j]];
-#endif
- }
- inrow += in_linesize;
- outrow += out_linesize;
- }
- }
+ /* planar >8 bit depth */
+ PLANAR_THREAD_DATA
+ ctx->internal->execute(ctx, lut_planar_16bits, &td, NULL,
+ FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else {
/* planar 8bit depth */
- uint8_t *inrow, *outrow;
-
- for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
- int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
- int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
- int h = AV_CEIL_RSHIFT(inlink->h, vsub);
- int w = AV_CEIL_RSHIFT(inlink->w, hsub);
- const uint16_t *tab = s->lut[plane];
- const int in_linesize = in->linesize[plane];
- const int out_linesize = out->linesize[plane];
-
- inrow = in ->data[plane];
- outrow = out->data[plane];
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++)
- outrow[j] = tab[inrow[j]];
- inrow += in_linesize;
- outrow += out_linesize;
- }
- }
+ PLANAR_THREAD_DATA
+ ctx->internal->execute(ctx, lut_planar_8bits, &td, NULL,
+ FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
}
if (!direct)
@@ -508,7 +592,7 @@ static const AVFilterPad outputs[] = {
.query_formats = query_formats, \
.inputs = inputs, \
.outputs = outputs, \
- .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, \
+ .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS, \
}
#if CONFIG_LUT_FILTER
More information about the ffmpeg-cvslog
mailing list