[FFmpeg-devel] patch2 Add-multithreading-for-swscale-filter.patch

Pawlowski, Slawomir slawomir.pawlowski at intel.com
Fri Jul 19 15:36:48 EEST 2019


>From 3ce39207e95eb4697abb0fbaccd37cc451559e49 Mon Sep 17 00:00:00 2001
From: Slawomir Pawlowski <slawomir.pawlowski at intel.com>
Date: Fri, 19 Jul 2019 13:16:16 +0200
Subject: [PATCH] Add multithreading for swscale filter.

Enable with the option "-filter_scale_threads <num_threads>".
The scaler splits each slice into parts that are processed by separate threads.

Signed-off-by: Slawomir Pawlowski <slawomir.pawlowski at intel.com>
Signed-off-by: Tomasz Szumski <tomasz.szumski at intel.com>
---
 fftools/ffmpeg.h              |   1 +
 fftools/ffmpeg_filter.c       |   3 +
 fftools/ffmpeg_opt.c          |   7 +
 libavfilter/avfilter.h        |  18 ++
 libavfilter/avfiltergraph.c   |   4 +
 libavfilter/vf_scale.c        |   4 +
 libswscale/options.c          |   3 +
 libswscale/slice.c            |  32 +++-
 libswscale/swscale.c          | 414 +++++++++++++++++++++++++++++-------------
 libswscale/swscale_internal.h |  46 +++++
 libswscale/utils.c            | 152 +++++++++++++++-
 11 files changed, 547 insertions(+), 137 deletions(-)

diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index eb1eaf6..ea1cef5 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -609,6 +609,7 @@ extern char *videotoolbox_pixfmt;
 
 extern int filter_nbthreads;
 extern int filter_complex_nbthreads;
+extern int filter_scale_nbthreads;
 extern int vstats_version;
 
 extern const AVIOInterruptCB int_cb;
diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
index 6518d50..793d3e9 100644
--- a/fftools/ffmpeg_filter.c
+++ b/fftools/ffmpeg_filter.c
@@ -1011,6 +1011,9 @@ int configure_filtergraph(FilterGraph *fg)
         AVDictionaryEntry *e = NULL;
 
         fg->graph->nb_threads = filter_nbthreads;
+#if HAVE_THREADS
+        fg->graph->sws_nbthreads = filter_scale_nbthreads;
+#endif
 
         args[0] = 0;
         while ((e = av_dict_get(ost->sws_dict, "", e,
diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
index d4851a2..37791ff 100644
--- a/fftools/ffmpeg_opt.c
+++ b/fftools/ffmpeg_opt.c
@@ -109,6 +109,9 @@ int frame_bits_per_raw_sample = 0;
 float max_error_rate  = 2.0/3;
 int filter_nbthreads = 0;
 int filter_complex_nbthreads = 0;
+#if HAVE_THREADS
+int filter_scale_nbthreads      = 0;
+#endif
 int vstats_version = 2;
 
 
@@ -3497,6 +3500,10 @@ const OptionDef options[] = {
     { "disposition",    OPT_STRING | HAS_ARG | OPT_SPEC |
                         OPT_OUTPUT,                                  { .off = OFFSET(disposition) },
         "disposition", "" },
+#if HAVE_THREADS
+    { "filter_scale_threads",  HAS_ARG | OPT_INT,                          { &filter_scale_nbthreads },
+        "number of threads for scale filter" },
+#endif
     { "thread_queue_size", HAS_ARG | OPT_INT | OPT_OFFSET | OPT_EXPERT | OPT_INPUT,
                                                                      { .off = OFFSET(thread_queue_size) },
         "set the maximum number of queued packets from the demuxer" },
diff --git a/libavfilter/avfilter.h b/libavfilter/avfilter.h
index 9d70e71..a2835d7 100644
--- a/libavfilter/avfilter.h
+++ b/libavfilter/avfilter.h
@@ -422,6 +422,16 @@ struct AVFilterContext {
      * configured.
      */
     int extra_hw_frames;
+
+
+#if HAVE_THREADS
+    /**
+     * Number of threads passed to swscale for scaling
+     */
+    int sws_slice_nbthreads;
+
+#endif
+
 };
 
 /**
@@ -907,6 +917,14 @@ typedef struct AVFilterGraph {
     int sink_links_count;
 
     unsigned disable_auto_convert;
+
+#if HAVE_THREADS
+    /**
+     * Number of scaling threads handed to every filter in the graph
+     */
+    int sws_nbthreads;
+#endif
+
 } AVFilterGraph;
 
 /**
diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c
index a149f8f..cbd6ad1 100644
--- a/libavfilter/avfiltergraph.c
+++ b/libavfilter/avfiltergraph.c
@@ -257,6 +257,10 @@ static int graph_config_links(AVFilterGraph *graph, AVClass *log_ctx)
     for (i = 0; i < graph->nb_filters; i++) {
         filt = graph->filters[i];
 
+#if HAVE_THREADS
+        filt->sws_slice_nbthreads = graph->sws_nbthreads;
+#endif
+
         if (!filt->nb_outputs) {
             if ((ret = avfilter_config_links(filt)))
                 return ret;
diff --git a/libavfilter/vf_scale.c b/libavfilter/vf_scale.c
index f741419..5098aee 100644
--- a/libavfilter/vf_scale.c
+++ b/libavfilter/vf_scale.c
@@ -299,6 +299,10 @@ static int config_props(AVFilterLink *outlink)
             av_opt_set_int(*s, "sws_flags", scale->flags, 0);
             av_opt_set_int(*s, "param0", scale->param[0], 0);
             av_opt_set_int(*s, "param1", scale->param[1], 0);
+#if HAVE_THREADS
+            av_opt_set_int(*s, "sw_nbthreads", ctx->sws_slice_nbthreads, 0);
+#endif
+
             if (scale->in_range != AVCOL_RANGE_UNSPECIFIED)
                 av_opt_set_int(*s, "src_range",
                                scale->in_range == AVCOL_RANGE_JPEG, 0);
diff --git a/libswscale/options.c b/libswscale/options.c
index 7eb2752..942c12d 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -80,6 +80,9 @@ static const AVOption swscale_options[] = {
     { "none",            "ignore alpha",                  0,                 AV_OPT_TYPE_CONST,  { .i64  = SWS_ALPHA_BLEND_NONE}, INT_MIN, INT_MAX,       VE, "alphablend" },
     { "uniform_color",   "blend onto a uniform color",    0,                 AV_OPT_TYPE_CONST,  { .i64  = SWS_ALPHA_BLEND_UNIFORM},INT_MIN, INT_MAX,     VE, "alphablend" },
     { "checkerboard",    "blend onto a checkerboard",     0,                 AV_OPT_TYPE_CONST,  { .i64  = SWS_ALPHA_BLEND_CHECKERBOARD},INT_MIN, INT_MAX,     VE, "alphablend" },
+#if HAVE_THREADS
+    { "sw_nbthreads",    "Threads number for scaling",     OFFSET(sw_nbthreads),      AV_OPT_TYPE_INT,    { .i64 = 0                 }, 0,       128,        VE },
+#endif
 
     { NULL }
 };
diff --git a/libswscale/slice.c b/libswscale/slice.c
index db4fa87..fd0c999 100644
--- a/libswscale/slice.c
+++ b/libswscale/slice.c
@@ -288,8 +288,13 @@ int ff_init_filters(SwsContext * c)
     c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
 
 
-    res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
-    if (res < 0) goto cleanup;
+    if(!c->parent) {
+        res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
+        if (res < 0) goto cleanup;
+    }
+    else {
+        memcpy(&c->slice[0],&c->parent->slice[0],sizeof(SwsSlice));
+    }
     for (i = 1; i < c->numSlice-2; ++i) {
         res = alloc_slice(&c->slice[i], c->srcFormat, lumBufSize, chrBufSize, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
         if (res < 0) goto cleanup;
@@ -306,8 +311,13 @@ int ff_init_filters(SwsContext * c)
 
     // vertical scaler output
     ++i;
-    res = alloc_slice(&c->slice[i], c->dstFormat, c->dstH, c->chrDstH, c->chrDstHSubSample, c->chrDstVSubSample, 0);
-    if (res < 0) goto cleanup;
+    if(!c->parent) {
+        res = alloc_slice(&c->slice[i], c->dstFormat, c->dstH, c->chrDstH, c->chrDstHSubSample, c->chrDstVSubSample, 0);
+        if (res < 0) goto cleanup;
+    }
+    else {
+        memcpy(&c->slice[i],&c->parent->slice[i],sizeof(SwsSlice));
+    }
 
     index = 0;
     srcIdx = 0;
@@ -320,6 +330,10 @@ int ff_init_filters(SwsContext * c)
     }
 
     if (need_lum_conv) {
+#if HAVE_THREADS
+    /* Multithreading is not supported for luma conversion */
+    c->sw_nbthreads = 0;
+#endif
         res = ff_init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
         if (res < 0) goto cleanup;
         c->desc[index].alpha = c->needAlpha;
@@ -384,8 +398,14 @@ int ff_free_filters(SwsContext *c)
     }
 
     if (c->slice) {
-        for (i = 0; i < c->numSlice; ++i)
-            free_slice(&c->slice[i]);
+        if(c->parent) {
+            for (i = 1; i < c->numSlice-1; ++i)
+                free_slice(&c->slice[i]);
+        }
+        else {
+            for (i = 0; i < c->numSlice; ++i)
+                free_slice(&c->slice[i]);
+        }
         av_freep(&c->slice);
     }
     return 0;
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 4069550..ffd15c7 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -22,6 +22,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
+#include <pthread.h>
 
 #include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
@@ -234,151 +235,51 @@ static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
     if (DEBUG_SWSCALE_BUFFERS)                  \
         av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 
-static int swscale(SwsContext *c, const uint8_t *src[],
-                   int srcStride[], int srcSliceY,
-                   int srcSliceH, uint8_t *dst[], int dstStride[])
+
+static void swscale_step(SwsContext *c)
 {
-    /* load a few things into local vars to make the code more readable?
-     * and faster */
-    const int dstW                   = c->dstW;
-    const int dstH                   = c->dstH;
+    SwsContextStep *step = &c->step_param;
+    int dstY= step->dstY;
+    int dstHend= step->dstHend;
+    int dstH= step->dstH;
+    int srcSliceY= step->srcSliceY;
+    int srcSliceH= step->srcSliceH;
 
-    const enum AVPixelFormat dstFormat = c->dstFormat;
-    const int flags                  = c->flags;
-    int32_t *vLumFilterPos           = c->vLumFilterPos;
-    int32_t *vChrFilterPos           = c->vChrFilterPos;
+    const int32_t *vLumFilterPos     = c->vLumFilterPos;
+    const int32_t *vChrFilterPos     = c->vChrFilterPos;
 
     const int vLumFilterSize         = c->vLumFilterSize;
     const int vChrFilterSize         = c->vChrFilterSize;
 
-    yuv2planar1_fn yuv2plane1        = c->yuv2plane1;
-    yuv2planarX_fn yuv2planeX        = c->yuv2planeX;
-    yuv2interleavedX_fn yuv2nv12cX   = c->yuv2nv12cX;
-    yuv2packed1_fn yuv2packed1       = c->yuv2packed1;
-    yuv2packed2_fn yuv2packed2       = c->yuv2packed2;
-    yuv2packedX_fn yuv2packedX       = c->yuv2packedX;
-    yuv2anyX_fn yuv2anyX             = c->yuv2anyX;
-    const int chrSrcSliceY           =                srcSliceY >> c->chrSrcVSubSample;
+    const int chrSrcSliceY           = srcSliceY >> c->chrSrcVSubSample;
     const int chrSrcSliceH           = AV_CEIL_RSHIFT(srcSliceH,   c->chrSrcVSubSample);
-    int should_dither                = isNBPS(c->srcFormat) ||
+    const int should_dither          = isNBPS(c->srcFormat) ||
                                        is16BPS(c->srcFormat);
-    int lastDstY;
 
     /* vars which will change and which we need to store back in the context */
-    int dstY         = c->dstY;
     int lumBufIndex  = c->lumBufIndex;
     int chrBufIndex  = c->chrBufIndex;
     int lastInLumBuf = c->lastInLumBuf;
     int lastInChrBuf = c->lastInChrBuf;
 
-
-    int lumStart = 0;
-    int lumEnd = c->descIndex[0];
-    int chrStart = lumEnd;
-    int chrEnd = c->descIndex[1];
-    int vStart = chrEnd;
-    int vEnd = c->numDesc;
-    SwsSlice *src_slice = &c->slice[lumStart];
+    const int lumStart = 0;
+    const int lumEnd = c->descIndex[0];
+    const int chrStart = lumEnd;
+    const int chrEnd = c->descIndex[1];
+    const int vStart = chrEnd;
+    const int vEnd = c->numDesc;
     SwsSlice *hout_slice = &c->slice[c->numSlice-2];
-    SwsSlice *vout_slice = &c->slice[c->numSlice-1];
     SwsFilterDescriptor *desc = c->desc;
 
-
-    int needAlpha = c->needAlpha;
-
     int hasLumHoles = 1;
     int hasChrHoles = 1;
 
+    int refreshBuff = 1;
 
-    if (isPacked(c->srcFormat)) {
-        src[0] =
-        src[1] =
-        src[2] =
-        src[3] = src[0];
-        srcStride[0] =
-        srcStride[1] =
-        srcStride[2] =
-        srcStride[3] = srcStride[0];
-    }
-    srcStride[1] <<= c->vChrDrop;
-    srcStride[2] <<= c->vChrDrop;
-
-    DEBUG_BUFFERS("swscale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
-                  src[0], srcStride[0], src[1], srcStride[1],
-                  src[2], srcStride[2], src[3], srcStride[3],
-                  dst[0], dstStride[0], dst[1], dstStride[1],
-                  dst[2], dstStride[2], dst[3], dstStride[3]);
-    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
-                  srcSliceY, srcSliceH, dstY, dstH);
-    DEBUG_BUFFERS("vLumFilterSize: %d vChrFilterSize: %d\n",
-                  vLumFilterSize, vChrFilterSize);
-
-    if (dstStride[0]&15 || dstStride[1]&15 ||
-        dstStride[2]&15 || dstStride[3]&15) {
-        static int warnedAlready = 0; // FIXME maybe move this into the context
-        if (flags & SWS_PRINT_INFO && !warnedAlready) {
-            av_log(c, AV_LOG_WARNING,
-                   "Warning: dstStride is not aligned!\n"
-                   "         ->cannot do aligned memory accesses anymore\n");
-            warnedAlready = 1;
-        }
-    }
-
-    if (   (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15
-        || (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15
-        || dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15
-        || srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15
-    ) {
-        static int warnedAlready=0;
-        int cpu_flags = av_get_cpu_flags();
-        if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
-            av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speed loss\n");
-            warnedAlready=1;
-        }
-    }
-
-    /* Note the user might start scaling the picture in the middle so this
-     * will not get executed. This is not really intended but works
-     * currently, so people might do it. */
-    if (srcSliceY == 0) {
-        lumBufIndex  = -1;
-        chrBufIndex  = -1;
-        dstY         = 0;
-        lastInLumBuf = -1;
-        lastInChrBuf = -1;
-    }
-
-    if (!should_dither) {
-        c->chrDither8 = c->lumDither8 = sws_pb_64;
-    }
-    lastDstY = dstY;
+    for (; dstY < dstHend; dstY++) {
 
-    ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
-                   yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, c->use_mmx_vfilter);
-
-    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
-            srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);
-
-    ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
-            dstY, dstH, dstY >> c->chrDstVSubSample,
-            AV_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);
-    if (srcSliceY == 0) {
-        hout_slice->plane[0].sliceY = lastInLumBuf + 1;
-        hout_slice->plane[1].sliceY = lastInChrBuf + 1;
-        hout_slice->plane[2].sliceY = lastInChrBuf + 1;
-        hout_slice->plane[3].sliceY = lastInLumBuf + 1;
-
-        hout_slice->plane[0].sliceH =
-        hout_slice->plane[1].sliceH =
-        hout_slice->plane[2].sliceH =
-        hout_slice->plane[3].sliceH = 0;
-        hout_slice->width = dstW;
-    }
-
-    for (; dstY < dstH; dstY++) {
         const int chrDstY = dstY >> c->chrDstVSubSample;
         int use_mmx_vfilter= c->use_mmx_vfilter;
-
         // First line needed as input
         const int firstLumSrcY  = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
         const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
@@ -395,9 +296,10 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         int posY, cPosY, firstPosY, lastPosY, firstCPosY, lastCPosY;
 
         // handle holes (FAST_BILINEAR & weird filters)
-        if (firstLumSrcY > lastInLumBuf) {
+        if (refreshBuff || firstLumSrcY > lastInLumBuf) {
 
             hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
+
             if (hasLumHoles) {
                 hout_slice->plane[0].sliceY = firstLumSrcY;
                 hout_slice->plane[3].sliceY = firstLumSrcY;
@@ -407,9 +309,10 @@ static int swscale(SwsContext *c, const uint8_t *src[],
 
             lastInLumBuf = firstLumSrcY - 1;
         }
-        if (firstChrSrcY > lastInChrBuf) {
+        if (refreshBuff || firstChrSrcY > lastInChrBuf) {
 
             hasChrHoles = lastInChrBuf != firstChrSrcY - 1;
+
             if (hasChrHoles) {
                 hout_slice->plane[1].sliceY = firstChrSrcY;
                 hout_slice->plane[2].sliceY = firstChrSrcY;
@@ -420,6 +323,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             lastInChrBuf = firstChrSrcY - 1;
         }
 
+        refreshBuff = 0;
+
         DEBUG_BUFFERS("dstY: %d\n", dstY);
         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                       firstLumSrcY, lastLumSrcY, lastInLumBuf);
@@ -440,8 +345,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         av_assert0((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->plane[0].available_lines);
         av_assert0((lastChrSrcY - firstChrSrcY + 1) <= hout_slice->plane[1].available_lines);
 
-
         posY = hout_slice->plane[0].sliceY + hout_slice->plane[0].sliceH;
+
         if (posY <= lastLumSrcY && !hasLumHoles) {
             firstPosY = FFMAX(firstLumSrcY, posY);
             lastPosY = FFMIN(firstLumSrcY + hout_slice->plane[0].available_lines - 1, srcSliceY + srcSliceH - 1);
@@ -496,11 +401,21 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         if (dstY >= dstH - 2) {
             /* hmm looks like we can't use MMX here without overwriting
              * this array's tail */
+
+            yuv2planar1_fn yuv2plane1        = c->yuv2plane1;
+            yuv2planarX_fn yuv2planeX        = c->yuv2planeX;
+            yuv2interleavedX_fn yuv2nv12cX   = c->yuv2nv12cX;
+            yuv2packed1_fn yuv2packed1       = c->yuv2packed1;
+            yuv2packed2_fn yuv2packed2       = c->yuv2packed2;
+            yuv2packedX_fn yuv2packedX       = c->yuv2packedX;
+            yuv2anyX_fn yuv2anyX             = c->yuv2anyX;
+
             ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
                                      &yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
             use_mmx_vfilter= 0;
             ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
                            yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter);
+
         }
 
         {
@@ -508,6 +423,252 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                 desc[i].process(c, &desc[i], dstY, 1);
         }
     }
+
+    /* store changed local vars back in the context */
+    c->dstY         = dstY;
+    c->lumBufIndex  = lumBufIndex;
+    c->chrBufIndex  = chrBufIndex;
+    c->lastInLumBuf = lastInLumBuf;
+    c->lastInChrBuf = lastInChrBuf;
+}
+
+#if HAVE_THREADS
+/* One-time setup of the worker contexts: each becomes a copy of c whose
+ * filters are re-initialised. Returns 0 or ff_init_filters()'s error. */
+static int swscale_threads_prepare(SwsContext *c)
+{
+    int i, ret;
+
+    if (c->is_threads_prepared)
+        return 0;
+    c->is_threads_prepared = 1;
+    if (!c->threads_ctx)
+        return 0;
+
+    for (i = 0; i < c->sw_nbthreads; ++i) {
+        struct SwsContextThread *ctx = &c->threads_ctx[i];
+        memcpy(ctx->func_ctx, c, sizeof(SwsContext));
+        ctx->func_ctx->parent = c; /* marks the copy as a worker context */
+        ctx->func_pfn = swscale_step;
+        if ((ret = ff_init_filters(ctx->func_ctx)) < 0)
+            return ret; /* report the failure instead of dropping it */
+    }
+    return 0;
+}
+#endif
+
+
+static int swscale(SwsContext *c, const uint8_t *src[],
+                   int srcStride[], int srcSliceY,
+                   int srcSliceH, uint8_t *dst[], int dstStride[])
+{
+    /* load a few things into local vars to make the code more readable?
+     * and faster */
+    const int dstW                   = c->dstW;
+    const int dstH                   = c->dstH;
+
+    const enum AVPixelFormat dstFormat = c->dstFormat;
+    const int flags                  = c->flags;
+
+    const int vLumFilterSize         = c->vLumFilterSize;
+    const int vChrFilterSize         = c->vChrFilterSize;
+
+    yuv2planar1_fn yuv2plane1        = c->yuv2plane1;
+    yuv2planarX_fn yuv2planeX        = c->yuv2planeX;
+    yuv2interleavedX_fn yuv2nv12cX   = c->yuv2nv12cX;
+    yuv2packed1_fn yuv2packed1       = c->yuv2packed1;
+    yuv2packed2_fn yuv2packed2       = c->yuv2packed2;
+    yuv2packedX_fn yuv2packedX       = c->yuv2packedX;
+    yuv2anyX_fn yuv2anyX             = c->yuv2anyX;
+    const int chrSrcSliceY           =                srcSliceY >> c->chrSrcVSubSample;
+    const int chrSrcSliceH           = AV_CEIL_RSHIFT(srcSliceH,   c->chrSrcVSubSample);
+    int should_dither                = isNBPS(c->srcFormat) ||
+                                       is16BPS(c->srcFormat);
+    int lastDstY;
+
+    /* vars which will change and which we need to store back in the context */
+    int dstY         = c->dstY;
+    int lastInLumBuf = c->lastInLumBuf;
+    int lastInChrBuf = c->lastInChrBuf;
+
+
+    int lumStart = 0;
+
+    SwsSlice *src_slice = &c->slice[lumStart];
+    SwsSlice *hout_slice = &c->slice[c->numSlice-2];
+    SwsSlice *vout_slice = &c->slice[c->numSlice-1];
+
+    int needAlpha = c->needAlpha;
+    SwsContextStep *step;
+    int last_chunk;
+
+#if HAVE_THREADS
+    int nbthreads = c->sw_nbthreads;
+    int left_lines;
+    int lines_per_thread = 0;
+    struct SwsContextThread *ctx;
+#endif
+
+    if (isPacked(c->srcFormat)) {
+        src[0] =
+        src[1] =
+        src[2] =
+        src[3] = src[0];
+        srcStride[0] =
+        srcStride[1] =
+        srcStride[2] =
+        srcStride[3] = srcStride[0];
+    }
+    srcStride[1] <<= c->vChrDrop;
+    srcStride[2] <<= c->vChrDrop;
+
+    DEBUG_BUFFERS("swscale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
+                  src[0], srcStride[0], src[1], srcStride[1],
+                  src[2], srcStride[2], src[3], srcStride[3],
+                  dst[0], dstStride[0], dst[1], dstStride[1],
+                  dst[2], dstStride[2], dst[3], dstStride[3]);
+    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
+                  srcSliceY, srcSliceH, dstY, dstH);
+    DEBUG_BUFFERS("vLumFilterSize: %d vChrFilterSize: %d\n",
+                  vLumFilterSize, vChrFilterSize);
+
+    if (dstStride[0]&15 || dstStride[1]&15 ||
+        dstStride[2]&15 || dstStride[3]&15) {
+        static int warnedAlready = 0; // FIXME maybe move this into the context
+        if (flags & SWS_PRINT_INFO && !warnedAlready) {
+            av_log(c, AV_LOG_WARNING,
+                   "Warning: dstStride is not aligned!\n"
+                   "         ->cannot do aligned memory accesses anymore\n");
+            warnedAlready = 1;
+        }
+    }
+
+    if (   (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15
+        || (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15
+        || dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15
+        || srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15
+    ) {
+        static int warnedAlready=0;
+        int cpu_flags = av_get_cpu_flags();
+        if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
+            av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speed loss\n");
+            warnedAlready=1;
+        }
+    }
+
+    /* Note the user might start scaling the picture in the middle so this
+     * will not get executed. This is not really intended but works
+     * currently, so people might do it. */
+    if (srcSliceY == 0) {
+        dstY         = 0;
+        lastInLumBuf = -1;
+        lastInChrBuf = -1;
+    }
+
+    if (!should_dither) {
+        c->chrDither8 = c->lumDither8 = sws_pb_64;
+    }
+    lastDstY = dstY;
+
+    ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
+                   yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, c->use_mmx_vfilter);
+
+    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
+            srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);
+
+    ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
+            dstY, dstH, dstY >> c->chrDstVSubSample,
+            AV_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);
+    if (srcSliceY == 0) {
+        hout_slice->plane[0].sliceY = lastInLumBuf + 1;
+        hout_slice->plane[1].sliceY = lastInChrBuf + 1;
+        hout_slice->plane[2].sliceY = lastInChrBuf + 1;
+        hout_slice->plane[3].sliceY = lastInLumBuf + 1;
+
+        hout_slice->plane[0].sliceH =
+        hout_slice->plane[1].sliceH =
+        hout_slice->plane[2].sliceH =
+        hout_slice->plane[3].sliceH = 0;
+        hout_slice->width = dstW;
+    }
+
+    last_chunk = dstH - dstY;
+
+#if HAVE_THREADS
+    left_lines = last_chunk;
+
+    if (nbthreads > 1 && c->threads_ctx) {
+        int slice_round = 64;
+
+        /* Reserve the last two lines for the calling thread. */
+        last_chunk = 2;
+        left_lines = left_lines - last_chunk;
+        lines_per_thread = (left_lines + nbthreads -1)/nbthreads;
+
+        if (lines_per_thread < slice_round)
+            lines_per_thread = slice_round;
+        else if (lines_per_thread & (slice_round - 1))
+            lines_per_thread += slice_round - (lines_per_thread & (slice_round - 1));
+
+        if (lines_per_thread > left_lines)
+            lines_per_thread =  left_lines;
+
+        nbthreads = (left_lines + lines_per_thread -1)/lines_per_thread;
+    } else {
+        nbthreads = 0;
+    }
+
+    swscale_threads_prepare(c);
+
+    for (int s = 0; s < nbthreads; s++) {
+        int chunk = lines_per_thread;
+        if (chunk > left_lines) {
+            chunk = left_lines;
+            /* Use current thread to calc last part. */
+            last_chunk += left_lines;
+            break;
+        }
+
+        left_lines -= chunk;
+        if (chunk <= 0)
+            break;
+
+        ctx = &c->threads_ctx[s];
+        step = &ctx->func_ctx->step_param;
+
+        step->dstY= dstY + s * lines_per_thread;
+        step->dstHend = dstY + s * lines_per_thread + chunk;
+        step->dstH = dstH;
+        step->srcSliceY = srcSliceY;
+        step->srcSliceH = srcSliceH;
+
+        pthread_mutex_lock(&ctx->process_mutex);
+        ctx->t_work = 1;
+        pthread_cond_signal(&ctx->process_cond);
+        pthread_mutex_unlock(&ctx->process_mutex);
+   }
+
+#endif
+
+   /*
+    * Process the remaining (or, without threads, all) lines on the
+    * original SwsContext so that its state fields get updated.
+    */
+    step = &c->step_param;
+    step->dstY= dstH - last_chunk;
+    step->dstHend = dstH;
+    step->dstH = dstH;
+    step->srcSliceY = srcSliceY;
+    step->srcSliceH = srcSliceH;
+    swscale_step(c);
+
+    dstY = c->dstY;
+
+#if HAVE_THREADS
+    swscale_thread_wait_finish(c);
+#endif
+
+
     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !needAlpha) {
         int length = dstW;
         int height = dstY - lastDstY;
@@ -527,13 +688,6 @@ static int swscale(SwsContext *c, const uint8_t *src[],
 #endif
     emms_c();
 
-    /* store changed local vars back in the context */
-    c->dstY         = dstY;
-    c->lumBufIndex  = lumBufIndex;
-    c->chrBufIndex  = chrBufIndex;
-    c->lastInLumBuf = lastInLumBuf;
-    c->lastInChrBuf = lastInChrBuf;
-
     return dstY - lastDstY;
 }
 
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 4fa5938..cbd387c 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -275,6 +275,17 @@ typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
 
 struct SwsSlice;
 struct SwsFilterDescriptor;
+#if HAVE_THREADS
+struct SwsContextThread;
+#endif
+
+typedef struct  SwsContextStep { /* work item for one swscale_step() call */
+    int dstY;      /* first output line to produce */
+    int dstHend;   /* end of the output line range, exclusive */
+    int dstH;      /* destination height (c->dstH) as set by swscale() */
+    int srcSliceY; /* first line of the input slice */
+    int srcSliceH; /* number of lines in the input slice */
+} SwsContextStep;
 
 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext {
@@ -625,9 +636,44 @@ typedef struct SwsContext {
     SwsDither dither;
 
     SwsAlphaBlend alphablend;
+
+    /*
+     * Set to the original context when this is a per-thread copy of it.
+     */
+    struct SwsContext *parent;
+
+    /*
+     * Parameters for the next swscale_step() call on this context.
+     */
+    SwsContextStep step_param;
+
+#if HAVE_THREADS
+    int is_threads_prepared;
+    int sw_nbthreads; //Number of threads to processing scale
+    struct SwsContextThread *threads_ctx;
+
+#endif
+
 } SwsContext;
 //FIXME check init (where 0)
 
+#if HAVE_THREADS
+struct SwsContextThread { /* state of one scaling worker thread */
+    void (*func_pfn)(SwsContext *c); /* job the worker runs on func_ctx */
+    SwsContext *func_ctx;            /* worker's own copy of the parent context */
+
+    pthread_t f_thread;
+    pthread_cond_t process_cond;     /* signalled when t_work or t_end is set */
+    pthread_cond_t finish_cond;      /* signalled when the worker clears t_work */
+    pthread_mutex_t process_mutex;   /* held while waiting on process_cond */
+    pthread_mutex_t finish_mutex;    /* held while waiting on finish_cond */
+    volatile int t_work;             /* 1 while a job is pending or running */
+    volatile int t_end;              /* set to 1 to make the worker exit */
+};
+
+void swscale_thread_wait_finish(struct SwsContext *c);
+#endif
+
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
 int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
                              int fullRange, int brightness,
diff --git a/libswscale/utils.c b/libswscale/utils.c
index d5913ed..5446411 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -27,6 +27,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
+#include <pthread.h>
 #if HAVE_MMAP
 #include <sys/mman.h>
 #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
@@ -1156,6 +1157,144 @@ static enum AVPixelFormat alphaless_fmt(enum AVPixelFormat fmt)
     }
 }
 
+
+#if HAVE_THREADS
+/* Worker loop: run func_pfn(func_ctx) whenever t_work is set; exit on t_end. */
+static void *swscale_thread(void *arg)
+{
+    struct SwsContextThread *ctx = arg;
+    int stop;
+
+    for (;;) {
+        pthread_mutex_lock(&ctx->process_mutex);
+        while (!ctx->t_work && !ctx->t_end)
+            pthread_cond_wait(&ctx->process_cond, &ctx->process_mutex);
+        stop = ctx->t_end; /* read under the mutex its writer holds */
+        pthread_mutex_unlock(&ctx->process_mutex);
+        if (stop)
+            break;
+
+        ctx->func_pfn(ctx->func_ctx);
+        pthread_mutex_lock(&ctx->finish_mutex);
+        ctx->t_work = 0;
+        pthread_cond_signal(&ctx->finish_cond);
+        pthread_mutex_unlock(&ctx->finish_mutex);
+    }
+    return NULL;
+}
+
+/* Stop and join every worker thread of c, then release their sync
+ * primitives, private contexts and the threads_ctx array itself. */
+static void swscale_thread_deinit(SwsContext *c)
+{
+    struct SwsContextThread *worker;
+    int n;
+
+    if (!c->threads_ctx)
+        return;
+
+    /* Ask all workers to quit before joining any of them. */
+    for (n = 0; n < c->sw_nbthreads; n++) {
+        worker = &c->threads_ctx[n];
+        pthread_mutex_lock(&worker->process_mutex);
+        worker->t_end = 1;
+        pthread_cond_signal(&worker->process_cond);
+        pthread_mutex_unlock(&worker->process_mutex);
+    }
+
+    for (n = 0; n < c->sw_nbthreads; n++) {
+        worker = &c->threads_ctx[n];
+        if (worker->f_thread)
+            pthread_join(worker->f_thread, NULL);
+    }
+
+    for (n = 0; n < c->sw_nbthreads; n++) {
+        worker = &c->threads_ctx[n];
+        pthread_mutex_destroy(&worker->process_mutex);
+        pthread_mutex_destroy(&worker->finish_mutex);
+        pthread_cond_destroy(&worker->process_cond);
+        pthread_cond_destroy(&worker->finish_cond);
+        if (worker->func_ctx) {
+            ff_free_filters(worker->func_ctx);
+            av_freep(&worker->func_ctx);
+        }
+    }
+
+    av_freep(&c->threads_ctx);
+}
+
+/* Allocate c->sw_nbthreads worker slots, give each a SwsContext of its own
+ * and start its thread. Returns 0 on success or a negative AVERROR code;
+ * on failure everything set up so far is torn down again. */
+static int swscale_thread_init(SwsContext *c)
+{
+    struct SwsContextThread *ctx;
+    int ret = 0;
+    int i;
+
+    c->threads_ctx = av_mallocz(c->sw_nbthreads * sizeof(*c->threads_ctx));
+    if (!c->threads_ctx)
+        return AVERROR(ENOMEM);
+
+    c->is_threads_prepared = 0;
+
+    /* Set up every slot's primitives first: the failure path destroys
+     * them for all slots, whether or not a thread was started. */
+    for (i = 0; i < c->sw_nbthreads; ++i) {
+        ctx = &c->threads_ctx[i];
+        ctx->t_work = 0;
+        ctx->t_end = 0;
+        pthread_mutex_init(&ctx->process_mutex, NULL);
+        pthread_mutex_init(&ctx->finish_mutex, NULL);
+        pthread_cond_init(&ctx->process_cond, NULL);
+        pthread_cond_init(&ctx->finish_cond, NULL);
+    }
+
+    for (i = 0; i < c->sw_nbthreads; ++i) {
+        ctx = &c->threads_ctx[i];
+        ctx->func_ctx = sws_alloc_context();
+        if (!ctx->func_ctx) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+    }
+
+    for (i = 0; i < c->sw_nbthreads; ++i) {
+        ctx = &c->threads_ctx[i];
+        /* pthread_create() reports failure as a positive errno value */
+        ret = pthread_create(&ctx->f_thread, NULL, swscale_thread, ctx);
+        if (ret) {
+            /* swscale_thread_deinit() joins only slots with non-zero f_thread */
+            memset(&ctx->f_thread, 0, sizeof(ctx->f_thread));
+            ret = AVERROR(ret);
+            goto fail;
+        }
+    }
+
+    return 0;
+
+fail:
+    swscale_thread_deinit(c);
+    return ret;
+}
+
+/* Block until every worker of c has cleared its t_work flag. */
+void swscale_thread_wait_finish(SwsContext *c)
+{
+    int i;
+    if (!c->sw_nbthreads || !c->threads_ctx) /* threading off or not set up */
+        return;
+    for (i = 0; i < c->sw_nbthreads; i++) {
+        struct SwsContextThread *ctx = &c->threads_ctx[i];
+        pthread_mutex_lock(&ctx->finish_mutex);
+        while (ctx->t_work)
+            pthread_cond_wait(&ctx->finish_cond, &ctx->finish_mutex);
+        pthread_mutex_unlock(&ctx->finish_mutex);
+    }
+}
+
+#endif
+
 av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                              SwsFilter *dstFilter)
 {
@@ -1823,7 +1962,14 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     }
 
     c->swscale = ff_getSwsFunc(c);
-    return ff_init_filters(c);
+
+    ret = ff_init_filters(c);
+
+#if HAVE_THREADS
+    ret = swscale_thread_init(c);
+#endif
+
+    return ret;
 fail: // FIXME replace things by appropriate error codes
     if (ret == RETCODE_USE_CASCADE)  {
         int tmpW = sqrt(srcW * (int64_t)dstW);
@@ -2308,6 +2454,10 @@ void sws_freeContext(SwsContext *c)
     if (!c)
         return;
 
+#if HAVE_THREADS
+    swscale_thread_deinit(c);
+#endif
+
     for (i = 0; i < 4; i++)
         av_freep(&c->dither_error[i]);
 
-- 
1.8.3.1

--------------------------------------------------------------------

Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek
przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by
others is strictly prohibited.



More information about the ffmpeg-devel mailing list