[FFmpeg-devel] [PATCH 1/3][GSoC] Add multithread function for dnn_backend_native_layer_conv2d.c

xujunzz at sjtu.edu.cn xujunzz at sjtu.edu.cn
Mon Aug 31 20:03:40 EEST 2020


From: Xu Jun <xujunzz at sjtu.edu.cn>

Use pthread to multithread dnn_execute_layer_conv2d.
Can be tested with the command "./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model=\
espcn.model:input=x:output=y -y sr_native.jpg -benchmark"

before patch: utime=11.238s stime=0.005s rtime=11.248s
after patch:  utime=20.817s stime=0.047s rtime=1.051s

Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
---
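Note for reviewers: below is a minimal standalone sketch of the
row-partitioning scheme the patch uses, with illustrative names and
numbers that are not part of the patch itself. Each worker thread
handles a contiguous band of output rows, and the last thread also
picks up the remainder when the row count is not evenly divisible by
the thread count.

#include <stdio.h>

/* Print the [start, end) output-row range each thread would cover,
 * mirroring the thread_start/thread_end computation in the patch. */
static void partition_rows(int height, int pad_size, int thread_num)
{
    int rows = height - pad_size * 2;   /* rows that produce output */
    int stride = rows / thread_num;     /* rows per thread */

    for (int i = 0; i < thread_num; i++) {
        int start = stride * i + pad_size;
        int end = (i == thread_num - 1) ? (height - pad_size)
                                        : (start + stride);
        printf("thread %d: rows [%d, %d)\n", i, start, end);
    }
}

int main(void)
{
    /* e.g. a 270-row frame, 3x3 kernel with VALID padding, 4 threads */
    partition_rows(270, 1, 4);
    return 0;
}

Compiled on its own, this prints the band assigned to each of the four
threads; the patch derives the same ranges inside
dnn_execute_layer_conv2d_thread() before looping over the rows.
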
 .../dnn/dnn_backend_native_layer_conv2d.c     | 95 ++++++++++++++++---
 1 file changed, 84 insertions(+), 11 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index d079795bf8..570b974052 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -19,10 +19,23 @@
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/thread.h"
+#include "libavutil/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 
+//struct to pass parameters
+typedef struct thread_data{
+    DnnOperand *operands;
+    const int32_t *input_operand_indexes;
+    int32_t output_operand_index;
+    const void *parameters;
+    NativeContext *ctx;
+    int32_t thread_num;
+    int32_t thread_index;
+} thread_data;
+
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
     ConvolutionalParams *conv_params;
@@ -88,17 +101,27 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
     return dnn_size;
 }
 
-int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
-                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+static void *dnn_execute_layer_conv2d_thread(void *threadarg)
 {
+    //mutex to protect thread_index while each thread picks its row range
+    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+
+    //pass parameters
+    struct thread_data *thread_data = (struct thread_data *)threadarg;
+    DnnOperand *operands = thread_data->operands;
+
+    int thread_stride;
+    int thread_start;
+    int thread_end;
+
     float *output;
-    int32_t input_operand_index = input_operand_indexes[0];
+    int32_t input_operand_index = thread_data->input_operand_indexes[0];
     int number = operands[input_operand_index].dims[0];
     int height = operands[input_operand_index].dims[1];
     int width = operands[input_operand_index].dims[2];
     int channel = operands[input_operand_index].dims[3];
     const float *input = operands[input_operand_index].data;
-    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
+    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_data->parameters);
 
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
@@ -106,7 +129,7 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     int filter_size = conv_params->kernel_size * filter_linesize;
     int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
-    DnnOperand *output_operand = &operands[output_operand_index];
+    DnnOperand *output_operand = &operands[thread_data->output_operand_index];
     output_operand->dims[0] = number;
     output_operand->dims[1] = height - pad_size * 2;
     output_operand->dims[2] = width - pad_size * 2;
@@ -114,19 +137,30 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     output_operand->data_type = operands[input_operand_index].data_type;
     output_operand->length = calculate_operand_data_length(output_operand);
     if (output_operand->length <= 0) {
-        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
-        return DNN_ERROR;
+        av_log(thread_data->ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        return (void *)(intptr_t)DNN_ERROR;
     }
     output_operand->data = av_realloc(output_operand->data, output_operand->length);
     if (!output_operand->data) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
-        return DNN_ERROR;
+        av_log(thread_data->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        return (void *)(intptr_t)DNN_ERROR;
     }
+
+    //calculate the row range for this thread; the last thread also takes any remaining rows
+    thread_stride = (height - pad_size * 2) / thread_data->thread_num;
+    pthread_mutex_lock(&mtx);
+    thread_start = thread_stride * thread_data->thread_index + pad_size;
+    thread_end = (thread_data->thread_index == thread_data->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
+    thread_data->thread_index += 1;
+    pthread_mutex_unlock(&mtx);
+
     output = output_operand->data;
+    //calculate output start pos for this thread
+    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
 
     av_assert0(channel == conv_params->input_num);
 
-    for (int y = pad_size; y < height - pad_size; ++y) {
+    for (int y = thread_start; y < thread_end; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                 if (conv_params->has_bias)
@@ -174,5 +208,44 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
             output += conv_params->output_num;
         }
     }
-    return 0;
+    return (void *)0;
+}
+
+
+int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
+                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+{
+    //use all available CPU cores minus one, but at least one thread
+    const int thread_num = FFMAX(1, av_cpu_count() - 1);
+    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
+    void *res;
+    int error_flag = 0;
+
+    //struct used to pass parameters
+    struct thread_data *thread_data;
+    thread_data = av_malloc(sizeof(*thread_data));
+    thread_data->operands = operands;
+    thread_data->input_operand_indexes = input_operand_indexes;
+    thread_data->output_operand_index = output_operand_index;
+    thread_data->parameters = parameters;
+    thread_data->ctx = ctx;
+    thread_data->thread_num = thread_num;
+    thread_data->thread_index = 0;
+
+    //create threads
+    for (int i = 0; i < thread_num; i++) {
+        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_data);
+    }
+
+    //join threads and collect each worker's return value
+    for (int i = 0; i < thread_num; i++) {
+        pthread_join(thread_id[i], &res);
+        if ((int)(intptr_t)res != 0)
+            error_flag = (int)(intptr_t)res;
+    }
+
+    //release memory
+    av_free(thread_id);
+    av_free(thread_data);
+    return error_flag;
 }
-- 
2.27.0


