[FFmpeg-devel] [PATCH] avfilter/pthread: rewrite implementation
Muhammad Faiz
mfcc64 at gmail.com
Fri Jul 7 05:25:46 EEST 2017
Avoid pthread_cond_broadcast that wakes up all workers. Make each of them
uses distict mutex/cond.
Benchmark using afir with threads=4 and 4096 taps fir:
channels=1:
old:
2128210 decicycles in afir_execute, 2 runs, 0 skips
1382927 decicycles in afir_execute, 1024 runs, 0 skips
1367985 decicycles in afir_execute, 16374 runs, 10 skips
new:
1011270 decicycles in afir_execute, 2 runs, 0 skips
939891 decicycles in afir_execute, 1024 runs, 0 skips
955812 decicycles in afir_execute, 16383 runs, 1 skips
channels=2:
old:
2801720 decicycles in afir_execute, 2 runs, 0 skips
1624556 decicycles in afir_execute, 1024 runs, 0 skips
1722584 decicycles in afir_execute, 16380 runs, 4 skips
new:
1864780 decicycles in afir_execute, 2 runs, 0 skips
1307955 decicycles in afir_execute, 1024 runs, 0 skips
1110917 decicycles in afir_execute, 16384 runs, 0 skips
channels=3:
old:
3031255 decicycles in afir_execute, 2 runs, 0 skips
2545295 decicycles in afir_execute, 1024 runs, 0 skips
2498368 decicycles in afir_execute, 16384 runs, 0 skips
new:
2213540 decicycles in afir_execute, 2 runs, 0 skips
2305479 decicycles in afir_execute, 1024 runs, 0 skips
2001942 decicycles in afir_execute, 16382 runs, 2 skips
channels=4:
old:
4642510 decicycles in afir_execute, 2 runs, 0 skips
3356856 decicycles in afir_execute, 1024 runs, 0 skips
2994766 decicycles in afir_execute, 16382 runs, 2 skips
new:
3590650 decicycles in afir_execute, 2 runs, 0 skips
2456035 decicycles in afir_execute, 1024 runs, 0 skips
2332966 decicycles in afir_execute, 16384 runs, 0 skips
channels=6:
old:
5057785 decicycles in afir_execute, 2 runs, 0 skips
4279000 decicycles in afir_execute, 1023 runs, 1 skips
4102256 decicycles in afir_execute, 16383 runs, 1 skips
new:
4244160 decicycles in afir_execute, 2 runs, 0 skips
3851306 decicycles in afir_execute, 1024 runs, 0 skips
3343221 decicycles in afir_execute, 16384 runs, 0 skips
channels=8:
old:
4871740 decicycles in afir_execute, 2 runs, 0 skips
4807337 decicycles in afir_execute, 1023 runs, 1 skips
4454018 decicycles in afir_execute, 16374 runs, 10 skips
new:
5055460 decicycles in afir_execute, 2 runs, 0 skips
4554674 decicycles in afir_execute, 1023 runs, 1 skips
4433398 decicycles in afir_execute, 16382 runs, 2 skips
Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
---
libavfilter/pthread.c | 189 +++++++++++++++++++++++++++++++-------------------
1 file changed, 116 insertions(+), 73 deletions(-)
diff --git a/libavfilter/pthread.c b/libavfilter/pthread.c
index c7a0021..8fb3409 100644
--- a/libavfilter/pthread.c
+++ b/libavfilter/pthread.c
@@ -21,6 +21,7 @@
* Libavfilter multithreading support
*/
+#include <stdatomic.h>
#include "config.h"
#include "libavutil/common.h"
@@ -32,61 +33,75 @@
#include "internal.h"
#include "thread.h"
+typedef struct WorkerContext WorkerContext;
+
typedef struct ThreadContext {
AVFilterGraph *graph;
- int nb_threads;
- pthread_t *workers;
+ int nb_workers;
+ WorkerContext *workers;
avfilter_action_func *func;
/* per-execute parameters */
AVFilterContext *ctx;
void *arg;
int *rets;
- int nb_jobs;
+ unsigned nb_jobs;
- pthread_cond_t last_job_cond;
- pthread_cond_t current_job_cond;
- pthread_mutex_t current_job_lock;
- int current_job;
- unsigned int current_execute;
+ pthread_mutex_t mutex_done;
+ pthread_cond_t cond_done;
+ atomic_uint current_job;
+ atomic_uint nb_finished_jobs;
int done;
} ThreadContext;
-static void* attribute_align_arg worker(void *v)
+struct WorkerContext {
+ ThreadContext *ctx;
+ pthread_t thread;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ int done;
+};
+
+static unsigned run_jobs(ThreadContext *c)
{
- ThreadContext *c = v;
- int our_job = c->nb_jobs;
- int nb_threads = c->nb_threads;
- unsigned int last_execute = 0;
- int ret, self_id;
-
- pthread_mutex_lock(&c->current_job_lock);
- self_id = c->current_job++;
-
- for (;;) {
- while (our_job >= c->nb_jobs) {
- if (c->current_job == nb_threads + c->nb_jobs)
- pthread_cond_signal(&c->last_job_cond);
-
- while (last_execute == c->current_execute && !c->done)
- pthread_cond_wait(&c->current_job_cond, &c->current_job_lock);
- last_execute = c->current_execute;
- our_job = self_id;
-
- if (c->done) {
- pthread_mutex_unlock(&c->current_job_lock);
- return NULL;
- }
- }
- pthread_mutex_unlock(&c->current_job_lock);
+ unsigned current_job, nb_finished_jobs = 0;
- ret = c->func(c->ctx, c->arg, our_job, c->nb_jobs);
+ while (nb_finished_jobs != c->nb_jobs &&
+ (current_job = atomic_fetch_add_explicit(&c->current_job, 1, memory_order_acq_rel)) < c->nb_jobs) {
+ int ret = c->func(c->ctx, c->arg, current_job, c->nb_jobs);
if (c->rets)
- c->rets[our_job % c->nb_jobs] = ret;
+ c->rets[current_job] = ret;
+ nb_finished_jobs = atomic_fetch_add_explicit(&c->nb_finished_jobs, 1, memory_order_acq_rel) + 1;
+ }
+
+ return nb_finished_jobs;
+}
+
+static void* attribute_align_arg worker(void *v)
+{
+ WorkerContext *w = v;
+ ThreadContext *c = w->ctx;
+
+ pthread_mutex_lock(&w->mutex);
+ pthread_cond_signal(&w->cond);
+
+ while (1) {
+ w->done = 1;
+ while (w->done)
+ pthread_cond_wait(&w->cond, &w->mutex);
+
+ if (c->done) {
+ pthread_mutex_unlock(&w->mutex);
+ return NULL;
+ }
- pthread_mutex_lock(&c->current_job_lock);
- our_job = c->current_job++;
+ if (run_jobs(c) == c->nb_jobs) {
+ pthread_mutex_lock(&c->mutex_done);
+ c->done = 1;
+ pthread_cond_signal(&c->cond_done);
+ pthread_mutex_unlock(&c->mutex_done);
+ }
}
}
@@ -94,48 +109,66 @@ static void slice_thread_uninit(ThreadContext *c)
{
int i;
- pthread_mutex_lock(&c->current_job_lock);
c->done = 1;
- pthread_cond_broadcast(&c->current_job_cond);
- pthread_mutex_unlock(&c->current_job_lock);
+ for (i = 0; i < c->nb_workers; i++) {
+ WorkerContext *w = &c->workers[i];
- for (i = 0; i < c->nb_threads; i++)
- pthread_join(c->workers[i], NULL);
+ pthread_mutex_lock(&w->mutex);
+ w->done = 0;
+ pthread_cond_signal(&w->cond);
+ pthread_mutex_unlock(&w->mutex);
- pthread_mutex_destroy(&c->current_job_lock);
- pthread_cond_destroy(&c->current_job_cond);
- pthread_cond_destroy(&c->last_job_cond);
- av_freep(&c->workers);
-}
+ pthread_join(w->thread, NULL);
+ pthread_cond_destroy(&w->cond);
+ pthread_mutex_destroy(&w->mutex);
+ }
-static void slice_thread_park_workers(ThreadContext *c)
-{
- while (c->current_job != c->nb_threads + c->nb_jobs)
- pthread_cond_wait(&c->last_job_cond, &c->current_job_lock);
- pthread_mutex_unlock(&c->current_job_lock);
+ pthread_cond_destroy(&c->cond_done);
+ pthread_mutex_destroy(&c->mutex_done);
+ av_freep(&c->workers);
}
static int thread_execute(AVFilterContext *ctx, avfilter_action_func *func,
- void *arg, int *ret, int nb_jobs)
+ void *arg, int *rets, int nb_jobs)
{
ThreadContext *c = ctx->graph->internal->thread;
+ int nb_workers, i;
if (nb_jobs <= 0)
return 0;
- pthread_mutex_lock(&c->current_job_lock);
+ if (nb_jobs == 1) {
+ int ret = func(ctx, arg, 0, 1);
+ if (rets)
+ rets[0] = ret;
+ return 0;
+ }
- c->current_job = c->nb_threads;
+ nb_workers = FFMIN(c->nb_workers, nb_jobs - 1);
+ atomic_store_explicit(&c->current_job, 0, memory_order_relaxed);
+ atomic_store_explicit(&c->nb_finished_jobs, 0, memory_order_relaxed);
c->nb_jobs = nb_jobs;
c->ctx = ctx;
c->arg = arg;
c->func = func;
- c->rets = ret;
- c->current_execute++;
+ c->rets = rets;
+
+ for (i = 0; i < nb_workers; i++) {
+ WorkerContext *w = &c->workers[i];
- pthread_cond_broadcast(&c->current_job_cond);
+ pthread_mutex_lock(&w->mutex);
+ w->done = 0;
+ pthread_cond_signal(&w->cond);
+ pthread_mutex_unlock(&w->mutex);
+ }
- slice_thread_park_workers(c);
+ if (run_jobs(c) != c->nb_jobs) {
+ pthread_mutex_lock(&c->mutex_done);
+ while (!c->done)
+ pthread_cond_wait(&c->cond_done, &c->mutex_done);
+ c->done = 0;
+ pthread_mutex_unlock(&c->mutex_done);
+ }
return 0;
}
@@ -156,33 +189,43 @@ static int thread_init_internal(ThreadContext *c, int nb_threads)
if (nb_threads <= 1)
return 1;
- c->nb_threads = nb_threads;
- c->workers = av_mallocz_array(sizeof(*c->workers), nb_threads);
+ c->nb_workers = nb_threads - 1;
+ c->workers = av_mallocz_array(sizeof(*c->workers), c->nb_workers);
if (!c->workers)
return AVERROR(ENOMEM);
- c->current_job = 0;
+ pthread_mutex_init(&c->mutex_done, NULL);
+ pthread_cond_init(&c->cond_done, NULL);
+ atomic_init(&c->current_job, 0);
+ atomic_init(&c->nb_finished_jobs, 0);
c->nb_jobs = 0;
c->done = 0;
- pthread_cond_init(&c->current_job_cond, NULL);
- pthread_cond_init(&c->last_job_cond, NULL);
+ for (i = 0; i < c->nb_workers; i++) {
+ WorkerContext *w = &c->workers[i];
- pthread_mutex_init(&c->current_job_lock, NULL);
- pthread_mutex_lock(&c->current_job_lock);
- for (i = 0; i < nb_threads; i++) {
- ret = pthread_create(&c->workers[i], NULL, worker, c);
+ w->ctx = c;
+ pthread_mutex_init(&w->mutex, NULL);
+ pthread_cond_init(&w->cond, NULL);
+ pthread_mutex_lock(&w->mutex);
+ w->done = 0;
+ ret = pthread_create(&w->thread, NULL, worker, w);
if (ret) {
- pthread_mutex_unlock(&c->current_job_lock);
- c->nb_threads = i;
+ c->nb_workers = i;
+ pthread_mutex_unlock(&w->mutex);
+ pthread_cond_destroy(&w->cond);
+ pthread_mutex_destroy(&w->mutex);
slice_thread_uninit(c);
return AVERROR(ret);
}
+
+ while (!w->done)
+ pthread_cond_wait(&w->cond, &w->mutex);
+ pthread_mutex_unlock(&w->mutex);
}
- slice_thread_park_workers(c);
- return c->nb_threads;
+ return c->nb_workers + 1;
}
int ff_graph_thread_init(AVFilterGraph *graph)
--
2.9.3
More information about the ffmpeg-devel
mailing list