[FFmpeg-devel] [PATCH v4 2/2] dnn_backend_native_layer_conv2d.c: Add multithread function
Xu Jun
xujunzz at sjtu.edu.cn
Sun Sep 6 15:19:11 EEST 2020
Hi, Steven
----- Original Message -----
> From: "Steven Liu" <lingjiujianke at gmail.com>
> To: "FFmpeg development discussions and patches" <ffmpeg-devel at ffmpeg.org>
> Sent: Saturday, September 5, 2020 6:07:45 AM
> Subject: Re: [FFmpeg-devel] [PATCH v4 2/2] dnn_backend_native_layer_conv2d.c: Add multithread function
> <xujunzz at sjtu.edu.cn> 于2020年9月4日周五 下午11:09写道:
>>
>> From: Xu Jun <xujunzz at sjtu.edu.cn>
>>
>> Use pthread to multithread dnn_execute_layer_conv2d.
>> Can be tested with command "./ffmpeg_g -i input.png -vf \
>> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
>> espcn.model:input=x:output=y:options=conv2d_threads=23 \
>> -y sr_native.jpg -benchmark"
>>
>> before patch: utime=11.238s stime=0.005s rtime=11.248s
>> after patch: utime=20.817s stime=0.047s rtime=1.051s
>> on my 3900X 12c24t @4.2GHz
>>
>> The increase in utime is expected: HyperThreading exposes twice as many
>> logical cores as there are physical cores, but the extra logical cores
>> add less than 2x compute throughput, and utime sums the runtime of every
>> logical core. As a result, running with a thread count close to the
>> number of logical cores roughly doubles utime on HyperThreading CPUs,
>> while the reduction in rtime is smaller than the thread count alone
>> would suggest.
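
To make those numbers concrete: 20.817 s of utime over 1.051 s of rtime
means roughly 20 logical cores were kept busy for the whole run, which is
consistent with 23 worker threads on a 24-thread CPU. The wall-clock
speedup is 11.248 / 1.051, about 10.7x, closer to the 12 physical cores
than to the 24 logical ones, because sibling SMT threads share execution
units.
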
>>
>> Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
>> ---
>> v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test
>> dnn-layer-conv2d-test.c
>> v4: use extern to call dnn_native_class in dnn-layer-conv2d-test.c
>>
>> .../dnn/dnn_backend_native_layer_conv2d.c | 107 ++++++++++++++++--
>> tests/dnn/dnn-layer-conv2d-test.c | 14 ++-
>> 2 files changed, 108 insertions(+), 13 deletions(-)
>>
>> diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> index d079795bf8..4068a13ab4 100644
>> --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> @@ -19,10 +19,27 @@
>> */
>>
>> #include "libavutil/avassert.h"
>> +#include "libavutil/thread.h"
>> +#include "libavutil/cpu.h"
>> #include "dnn_backend_native_layer_conv2d.h"
>>
>> #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
>>
>> +//struct to pass parameters
>> +typedef struct thread_common_param{
>> + DnnOperand *operands;
>> + const int32_t *input_operand_indexes;
>> + int32_t output_operand_index;
>> + const void *parameters;
>> + NativeContext *ctx;
>> + int thread_num;
>> +} thread_common_param;
>> +
>> +typedef struct thread_param{
>> + thread_common_param *thread_common_param;
>> + int thread_index;
>> +} thread_param;
>> +
>> int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
>> {
>> ConvolutionalParams *conv_params;
>> @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
>> return dnn_size;
>> }
>>
>> -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
>> - int32_t output_operand_index, const void *parameters, NativeContext *ctx)
>> +static void * dnn_execute_layer_conv2d_thread(void *threadarg)
>> {
>> + //pass parameters
>> + thread_param *thread_param = (struct thread_param *)threadarg;
>> + thread_common_param *thread_common_param = thread_param->thread_common_param;
>> + DnnOperand *operands = thread_common_param->operands;
>> float *output;
>> - int32_t input_operand_index = input_operand_indexes[0];
>> + int32_t input_operand_index = thread_common_param->input_operand_indexes[0];
>> int number = operands[input_operand_index].dims[0];
>> int height = operands[input_operand_index].dims[1];
>> int width = operands[input_operand_index].dims[2];
>> int channel = operands[input_operand_index].dims[3];
>> const float *input = operands[input_operand_index].data;
>> - const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
>> + const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters);
>>
>> int radius = conv_params->kernel_size >> 1;
>> int src_linesize = width * conv_params->input_num;
>> @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
>> int filter_size = conv_params->kernel_size * filter_linesize;
>> int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>>
>> - DnnOperand *output_operand = &operands[output_operand_index];
>> + int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num;
>> + int thread_start = thread_stride * thread_param->thread_index + pad_size;
>> + int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
>> +
>> + DnnOperand *output_operand = &operands[thread_common_param->output_operand_index];
>> output_operand->dims[0] = number;
>> output_operand->dims[1] = height - pad_size * 2;
>> output_operand->dims[2] = width - pad_size * 2;
>> @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
>> output_operand->data_type = operands[input_operand_index].data_type;
>> output_operand->length = calculate_operand_data_length(output_operand);
>> if (output_operand->length <= 0) {
>> - av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
>> - return DNN_ERROR;
>> + av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length
>> overflow\n");
>> + return (void *)DNN_ERROR;
>> }
>> output_operand->data = av_realloc(output_operand->data, output_operand->length);
>> if (!output_operand->data) {
>> - av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
>> - return DNN_ERROR;
>> + av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate
>> memory for output\n");
>> + return (void *)DNN_ERROR;
>> }
>> +
>> output = output_operand->data;
>> + output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
>>
>> av_assert0(channel == conv_params->input_num);
>>
>> - for (int y = pad_size; y < height - pad_size; ++y) {
>> + for (int y = thread_start; y < thread_end; ++y) {
>> for (int x = pad_size; x < width - pad_size; ++x) {
>> for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
>> if (conv_params->has_bias)
>> @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
>> output += conv_params->output_num;
>> }
>> }
>> - return 0;
>> + return (void *)0;
> Why do you return (void *)0? I see that dnn_execute_layer_conv2d has return type int.
Actually, this should return (void *)DNN_SUCCESS to be consistent with the rest of the code.
Thank you for pointing that out!
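
For reference, here is a minimal standalone sketch (not the patch itself)
of that convention: the worker returns a DNNReturnType smuggled through the
void * that pthread_join() hands back, casting through intptr_t so the
value round-trips cleanly and without the pointer-truncation warning that a
plain (int) cast can produce on 64-bit targets. The DNNReturnType enum
below is a local stand-in for the one in dnn_interface.h; everything else
is illustrative only.

    /* sketch only: stand-in for the DNNReturnType enum from dnn_interface.h */
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef enum { DNN_SUCCESS, DNN_ERROR } DNNReturnType;

    static void *worker(void *arg)
    {
        int fail = *(int *)arg;
        /* ... the per-thread slice of the convolution would run here ... */
        return (void *)(intptr_t)(fail ? DNN_ERROR : DNN_SUCCESS);
    }

    int main(void)
    {
        pthread_t tid;
        int fail = 0;
        void *res;

        if (pthread_create(&tid, NULL, worker, &fail))
            return 1;
        pthread_join(tid, &res);

        /* recover the enum value the worker returned */
        if ((DNNReturnType)(intptr_t)res != DNN_SUCCESS)
            fprintf(stderr, "conv2d worker failed\n");
        return 0;
    }

The same intptr_t round-trip in dnn_execute_layer_conv2d would also avoid
the (int)res casts on the pthread_join results.
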
>> +}
>> +
>> +
>> +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
>> + int32_t output_operand_index, const void *parameters, NativeContext *ctx)
>> +{
>> + int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count())
>> + ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
>> +#if HAVE_PTHREAD_CANCEL
>> + pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
>> +#endif
>> + thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param));
>> + void *res;
>> + int error_flag = 0;
>> +
>> + //struct used to pass parameters
>> + thread_common_param thread_common_param;
>> + thread_common_param.operands = operands;
>> + thread_common_param.input_operand_indexes = input_operand_indexes;
>> + thread_common_param.output_operand_index = output_operand_index;
>> + thread_common_param.parameters = parameters;
>> + thread_common_param.ctx = ctx;
>> +#if HAVE_PTHREAD_CANCEL
>> + thread_common_param.thread_num = thread_num;
>> +
>> + //create threads
>> + for (int i = 0; i < thread_num; i++){
>> + thread_param[i] = av_malloc(sizeof(thread_param));
>> + thread_param[i]->thread_common_param = &thread_common_param;
>> + thread_param[i]->thread_index = i;
>> + pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]);
>> + }
>> +
>> + //join threads, res gets function return
>> + for (int i = 0; i < thread_num; i++){
>> + pthread_join(thread_id[i], &res);
>> + if ((int)res != 0)
>> + error_flag = (int)res;
>> + }
>> +
>> + //release memory
>> + av_free(thread_id);
>> +
>> + for (int i = 0; i < thread_num; i++){
>> + av_free(thread_param[i]);
>> + }
>> +#else
>> + thread_common_param.thread_num = 1;
>> + thread_param[0] = av_malloc(sizeof(thread_param));
>> + thread_param[0]->thread_common_param = &thread_common_param;
>> + thread_param[0]->thread_index = 0;
>> + res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
>> + if ((int)res != 0)
>> + error_flag = (int)res;
>> + av_free(thread_param[0]);
>> +#endif
>> +
>> + av_free(thread_param);
>> + return error_flag;
>> }
>> diff --git a/tests/dnn/dnn-layer-conv2d-test.c b/tests/dnn/dnn-layer-conv2d-test.c
>> index 836839cc64..378a05eafc 100644
>> --- a/tests/dnn/dnn-layer-conv2d-test.c
>> +++ b/tests/dnn/dnn-layer-conv2d-test.c
>> @@ -25,6 +25,8 @@
>>
>> #define EPSON 0.00001
>>
>> +extern const AVClass dnn_native_class;
>> +
>> static int test_with_same_dilate(void)
>> {
>> // the input data and expected data are generated with below python code.
>> @@ -96,6 +98,10 @@ static int test_with_same_dilate(void)
>> };
>> float bias[2] = { -1.6574852, -0.72915393 };
>>
>> + NativeContext ctx;
>> + ctx.class = &dnn_native_class;
>> + ctx.options.conv2d_threads = 1;
>> +
>> params.activation = TANH;
>> params.has_bias = 1;
>> params.biases = bias;
>> @@ -114,7 +120,7 @@ static int test_with_same_dilate(void)
>> operands[1].data = NULL;
>>
>> input_indexes[0] = 0;
>> - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
>> + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>>
>> output = operands[1].data;
>> for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>> @@ -196,6 +202,10 @@ static int test_with_valid(void)
>> };
>> float bias[2] = { -0.4773722, -0.19620377 };
>>
>> + NativeContext ctx;
>> + ctx.class = &dnn_native_class;
>> + ctx.options.conv2d_threads = 1;
>> +
>> params.activation = TANH;
>> params.has_bias = 1;
>> params.biases = bias;
>> @@ -214,7 +224,7 @@ static int test_with_valid(void)
>> operands[1].data = NULL;
>>
>> input_indexes[0] = 0;
>> - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
>> + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>>
>> output = operands[1].data;
>> for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>> --
>> 2.28.0
>>
>
>
> Thanks
> Steven
Thanks
- Xu Jun