[FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect for object detection

Mon Mar 1 06:50:16 EET 2021

Guo, Yejun:
> Below are the example steps to do object detection:
> 
> 1. download and install l_openvino_toolkit_p_2021.1.110.tgz from
> https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit/download.html
>   or, we can get source code (tag 2021.1), build and install.
> 2. export LD_LIBRARY_PATH with openvino settings, for example:
> .../deployment_tools/inference_engine/lib/intel64/:.../deployment_tools/inference_engine/external/tbb/lib/
> 3. rebuild ffmpeg from source code with configure option:
> --enable-libopenvino
> --extra-cflags='-I.../deployment_tools/inference_engine/include/'
> --extra-ldflags='-L.../deployment_tools/inference_engine/lib/intel64'
> 4. download model files and test image
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.bin
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.xml
> wget
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.label
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/images/cici.jpg
> 5. run ffmpeg with:
> ./ffmpeg -i cici.jpg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:input=data:output=detection_out:confidence=0.6:labels=face-detection-adas-0001.label -f null -
> 
> We'll see the detect result as below:
> [Parsed_dnn_detect_0 @ 0x56226644e540] frame->private_ref (bounding boxes):
> [Parsed_dnn_detect_0 @ 0x56226644e540] source: face-detection-adas-0001.xml
> [Parsed_dnn_detect_0 @ 0x56226644e540] index: 0, region: (1005, 813) -> (1086, 905), label: face, confidence: 10000/10000.
> [Parsed_dnn_detect_0 @ 0x56226644e540] index: 1, region: (888, 839) -> (967, 926), label: face, confidence: 6917/10000.
> 
> There are two faces detected with confidence 100% and 69.17%.
> 
> Signed-off-by: Guo, Yejun <yejun.guo at intel.com>
> ---
>  configure                              |   1 +
>  doc/filters.texi                       |  40 +++
>  libavfilter/Makefile                   |   1 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>  libavfilter/dnn_filter_common.c        |   7 +
>  libavfilter/dnn_filter_common.h        |   1 +
>  libavfilter/dnn_interface.h            |   6 +-
>  libavfilter/vf_dnn_detect.c            | 462 +++++++++++++++++++++++++
>  9 files changed, 529 insertions(+), 2 deletions(-)
>  create mode 100644 libavfilter/vf_dnn_detect.c
> 
> diff --git a/configure b/configure
> index 336301cb40..cdac292c2f 100755
> --- a/configure
> +++ b/configure
> @@ -3549,6 +3549,7 @@ derain_filter_select="dnn"
>  deshake_filter_select="pixelutils"
>  deshake_opencl_filter_deps="opencl"
>  dilation_opencl_filter_deps="opencl"
> +dnn_detect_filter_select="dnn"
>  dnn_processing_filter_select="dnn"
>  drawtext_filter_deps="libfreetype"
>  drawtext_filter_suggest="libfontconfig libfribidi"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 426cb158da..55368c6f1b 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -10133,6 +10133,46 @@ ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
>  @end example
>  @end itemize
>  
> + at section dnn_detect
> +
> +Do object detection with deep neural networks.
> +
> +The filter accepts the following options:
> +
> + at table @option
> + at item dnn_backend
> +Specify which DNN backend to use for model loading and execution. This option accepts
> +only openvino now, tensorflow backends will be added.
> +
> + at item model
> +Set path to model file specifying network architecture and its parameters.
> +Note that different backends use different file formats.
> +
> + at item input
> +Set the input name of the dnn network.
> +
> + at item output
> +Set the output name of the dnn network.
> +
> + at item confidence
> +Set the confidence threshold (default: 0.5).
> +
> + at item labels
> +Set path to label file specifying the mapping between label id and name.
> +Each label name is written in one line, tailing spaces and empty lines are skipped.
> +The first line is the name of label id 0 (usually it is 'background'),
> +and the second line is the name of label id 1, etc.
> +The label id is considered as name if the label file is not provided.
> +
> + at item backend_configs
> +Set the configs to be passed into backend
> +
> + at item async
> +use DNN async execution if set (default: set),
> +roll back to sync execution if the backend does not support async.
> +
> + at end table
> +
>  @anchor{dnn_processing}
>  @section dnn_processing
>  
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 359ea7f903..b14c0ecdb9 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -245,6 +245,7 @@ OBJS-$(CONFIG_DILATION_FILTER)               += vf_neighbor.o
>  OBJS-$(CONFIG_DILATION_OPENCL_FILTER)        += vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o framesync.o
> +OBJS-$(CONFIG_DNN_DETECT_FILTER)             += vf_dnn_detect.o
>  OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         += vf_dnn_processing.o
>  OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
>  OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 452c030706..85419ec3f1 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -230,6 +230,7 @@ extern AVFilter ff_vf_detelecine;
>  extern AVFilter ff_vf_dilation;
>  extern AVFilter ff_vf_dilation_opencl;
>  extern AVFilter ff_vf_displace;
> +extern AVFilter ff_vf_dnn_detect;
>  extern AVFilter ff_vf_dnn_processing;
>  extern AVFilter ff_vf_doubleweave;
>  extern AVFilter ff_vf_drawbox;
> diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
> index 5be053b7f8..928d84b744 100644
> --- a/libavfilter/dnn/dnn_backend_openvino.c
> +++ b/libavfilter/dnn/dnn_backend_openvino.c
> @@ -621,6 +621,12 @@ DNNReturnType ff_dnn_execute_model_ov(const DNNModel *model, const char *input_n
>          return DNN_ERROR;
>      }
>  
> +    if (model->func_type != DFT_PROCESS_FRAME) {
> +        if (!out_frame) {
> +            out_frame = in_frame;
> +        }
> +    }
> +
>      if (nb_output != 1) {
>          // currently, the filter does not need multiple outputs,
>          // so we just pending the support until we really need it.
> @@ -674,6 +680,12 @@ DNNReturnType ff_dnn_execute_model_async_ov(const DNNModel *model, const char *i
>          return DNN_ERROR;
>      }
>  
> +    if (model->func_type != DFT_PROCESS_FRAME) {
> +        if (!out_frame) {
> +            out_frame = in_frame;
> +        }
> +    }
> +
>      task = av_malloc(sizeof(*task));
>      if (!task) {
>          av_log(ctx, AV_LOG_ERROR, "unable to alloc memory for task item.\n");
> diff --git a/libavfilter/dnn_filter_common.c b/libavfilter/dnn_filter_common.c
> index 413adba406..92b696e710 100644
> --- a/libavfilter/dnn_filter_common.c
> +++ b/libavfilter/dnn_filter_common.c
> @@ -64,6 +64,13 @@ int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *fil
>      return 0;
>  }
>  
> +int ff_dnn_set_proc(DnnContext *ctx, PRE_POST_PROC pre_proc, PRE_POST_PROC post_proc)
> +{
> +    ctx->model->pre_proc = pre_proc;
> +    ctx->model->post_proc = post_proc;
> +    return 0;
> +}
> +
>  DNNReturnType ff_dnn_get_input(DnnContext *ctx, DNNData *input)
>  {
>      return ctx->model->get_input(ctx->model->model, input, ctx->model_inputname);
> diff --git a/libavfilter/dnn_filter_common.h b/libavfilter/dnn_filter_common.h
> index 79c4d3efe3..0e88b88bdd 100644
> --- a/libavfilter/dnn_filter_common.h
> +++ b/libavfilter/dnn_filter_common.h
> @@ -48,6 +48,7 @@ typedef struct DnnContext {
>  
>  
>  int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx);
> +int ff_dnn_set_proc(DnnContext *ctx, PRE_POST_PROC pre_proc, PRE_POST_PROC post_proc);
>  DNNReturnType ff_dnn_get_input(DnnContext *ctx, DNNData *input);
>  DNNReturnType ff_dnn_get_output(DnnContext *ctx, int input_width, int input_height, int *output_width, int *output_height);
>  DNNReturnType ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame);
> diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
> index d3a0c58a61..90a08129f4 100644
> --- a/libavfilter/dnn_interface.h
> +++ b/libavfilter/dnn_interface.h
> @@ -63,6 +63,8 @@ typedef struct DNNData{
>      DNNColorOrder order;
>  } DNNData;
>  
> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model, AVFilterContext *filter_ctx);

Why uppercase? It is a typedef, not a macro.

> +
>  typedef struct DNNModel{
>      // Stores model that can be different for different backends.
>      void *model;
> @@ -80,10 +82,10 @@ typedef struct DNNModel{
>                                  const char *output_name, int *output_width, int *output_height);
>      // set the pre process to transfer data from AVFrame to DNNData
>      // the default implementation within DNN is used if it is not provided by the filter
> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input, AVFilterContext *filter_ctx);
> +    PRE_POST_PROC pre_proc;
>      // set the post process to transfer data from DNNData to AVFrame
>      // the default implementation within DNN is used if it is not provided by the filter
> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output, AVFilterContext *filter_ctx);
> +    PRE_POST_PROC post_proc;

Spurious change.

>  } DNNModel;
>  
>  // Stores pointers to functions for loading, executing, freeing DNN models for one of the backends.
> diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
> new file mode 100644
> index 0000000000..13b82579a9
> --- /dev/null
> +++ b/libavfilter/vf_dnn_detect.c
> @@ -0,0 +1,462 @@
> +/*
> + * Copyright (c) 2021
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * implementing an object detecting filter using deep learning networks.
> + */
> +
> +#include "libavformat/avio.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/avassert.h"
> +#include "libavutil/imgutils.h"
> +#include "filters.h"
> +#include "dnn_filter_common.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "libavutil/time.h"
> +#include "bbox.h"
> +
> +typedef struct DnnDetectContext {
> +    const AVClass *class;
> +    DnnContext dnnctx;
> +    float confidence;
> +    char *labels_filename;
> +    char **labels;
> +    int label_count;
> +} DnnDetectContext;
> +
> +#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
> +#define OFFSET2(x) offsetof(DnnDetectContext, x)
> +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
> +static const AVOption dnn_detect_options[] = {
> +    { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = 2 },    INT_MIN, INT_MAX, FLAGS, "backend" },
> +#if (CONFIG_LIBOPENVINO == 1)
> +    { "openvino",    "openvino backend flag",      0,                        AV_OPT_TYPE_CONST,     { .i64 = 2 },    0, 0, FLAGS, "backend" },
> +#endif
> +    DNN_COMMON_OPTIONS
> +    { "confidence",  "threshold of confidence",    OFFSET2(confidence),      AV_OPT_TYPE_FLOAT,     { .dbl = 0.5 },  0, 1, FLAGS},
> +    { "labels",      "path to labels file",        OFFSET2(labels_filename), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(dnn_detect);
> +
> +// remove this function once we update vf_drawbox/text
> +// to visualize the bbox
> +static void dump_boundingbox(AVFilterContext *ctx, AVFrame *frame)
> +{
> +    int nb_bbox = 0;
> +    BoundingBox *bbox;
> +    BoundingBoxHeader *header;
> +
> +    if (!frame->private_ref)
> +        return;
> +
> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> +    bbox = (BoundingBox *)(header + 1);
> +
> +    if (!header->bbox_size || (frame->private_ref->size - sizeof(*header)) % header->bbox_size != 0) {
> +        av_log(ctx, AV_LOG_ERROR, "invalid size of bounding box\n");
> +        return;
> +    }
> +
> +    nb_bbox = (frame->private_ref->size - sizeof(*header)) / header->bbox_size;
> +    av_log(ctx, AV_LOG_INFO, "frame->private_ref (bounding boxes):\n");
> +    av_log(ctx, AV_LOG_INFO, "source: %s\n", header->source);
> +    for (int i = 0; i < nb_bbox; i++) {
> +        av_log(ctx, AV_LOG_INFO, "index: %d, region: (%d, %d) -> (%d, %d), label: %s, confidence: %d/%d.\n",
> +                                 i, bbox->left, bbox->top, bbox->right, bbox->bottom,
> +                                 bbox->detect_label, bbox->detect_confidence.num, bbox->detect_confidence.den);
> +        if (bbox->classify_count > 0) {
> +            for (int j = 0; j < bbox->classify_count; j++) {
> +                av_log(ctx, AV_LOG_INFO, "\t\tclassify:  label: %s, confidence: %d/%d.\n",
> +                       bbox->classify_labels[j], bbox->classify_confidences[j].num, bbox->classify_confidences[j].den);
> +            }
> +        }
> +        bbox++;
> +    }
> +}
> +
> +static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
> +{
> +    DnnDetectContext *ctx = filter_ctx->priv;
> +    float conf_threshold = ctx->confidence;
> +    int proposal_count = output->height;
> +    int detect_size = output->width;
> +    float *detections = output->data;
> +    int nb_bbox = 0;
> +    BoundingBox *bbox;
> +    BoundingBoxHeader *header;
> +
> +    for (int i = 0; i < proposal_count; ++i) {
> +        float conf = detections[i * detect_size + 2];
> +        if (conf < conf_threshold) {
> +            continue;
> +        }
> +        nb_bbox++;
> +    }
> +
> +    if (nb_bbox == 0) {
> +        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
> +        return 0;
> +    }
> +
> +    if (frame->private_ref) {
> +        av_log(filter_ctx, AV_LOG_ERROR, "frame->private_ref is already occupied\n");
> +        return -1;
> +    }
> +
> +    frame->private_ref = av_buffer_alloc(sizeof(*header) + sizeof(*bbox) * nb_bbox);
> +    if (!frame->private_ref) {
> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer for %d bboxes\n", nb_bbox);
> +        return AVERROR(ENOMEM);;

Double ;

> +    }
> +
> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> +    strncpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
> +    header->source[sizeof(header->source) - 1] = '\0';
> +    header->bbox_size = sizeof(*bbox);
> +
> +    bbox = (BoundingBox *)(header + 1);

This does not guarantee proper alignment. You could use a flexible array
member for that.

> +    for (int i = 0; i < proposal_count; ++i) {
> +        int av_unused image_id = (int)detections[i * detect_size + 0];
> +        int label_id = (int)detections[i * detect_size + 1];
> +        float conf   =      detections[i * detect_size + 2];
> +        float x0     =      detections[i * detect_size + 3];
> +        float y0     =      detections[i * detect_size + 4];
> +        float x1     =      detections[i * detect_size + 5];
> +        float y1     =      detections[i * detect_size + 6];
> +
> +        if (conf < conf_threshold) {
> +            continue;
> +        }
> +
> +        bbox->left      = (int)(x0 * frame->width);
> +        bbox->right     = (int)(x1 * frame->width);
> +        bbox->top       = (int)(y0 * frame->height);
> +        bbox->bottom    = (int)(y1 * frame->height);
> +
> +        bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
> +        bbox->classify_count = 0;
> +
> +        if (ctx->labels && label_id < ctx->label_count) {
> +            strcpy(bbox->detect_label, ctx->labels[label_id]);
> +        } else {
> +            snprintf(bbox->detect_label, BBOX_LABEL_NAME_MAX_LENGTH, "%d", label_id);
> +        }
> +
> +        nb_bbox--;
> +        if (nb_bbox == 0) {
> +            break;
> +        }
> +        bbox++;
> +    }
> +
> +    dump_boundingbox(filter_ctx, frame);
> +
> +    return 0;
> +}
> +
> +static void free_detect_labels(DnnDetectContext *ctx)
> +{
> +    for (int i = 0; i < ctx->label_count; i++) {
> +        av_freep(&ctx->labels[i]);
> +    }
> +    av_freep(&ctx->labels);
> +}
> +
> +static int read_detect_label_file(AVFilterContext *context)
> +{
> +    int line_len;
> +    FILE *file;
> +    DnnDetectContext *ctx = context->priv;
> +
> +    file = av_fopen_utf8(ctx->labels_filename, "r");
> +    if (!file){
> +        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    while (!feof(file)) {
> +        char *label;
> +        char buf[256];
> +        if (!fgets(buf, 256, file)) {
> +            break;
> +        }
> +
> +        line_len = strlen(buf);
> +        while (line_len) {
> +            int i = line_len - 1;
> +            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
> +                buf[i] = '\0';
> +                line_len--;
> +            } else {
> +                break;
> +            }
> +        }
> +
> +        if (line_len == 0)  // empty line
> +            continue;
> +
> +        if (line_len > BBOX_LABEL_NAME_MAX_LENGTH) {
> +            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
> +            fclose(file);
> +            free_detect_labels(ctx);
> +            return AVERROR(EINVAL);
> +        }
> +
> +        label = av_strdup(buf);
> +        if (!label) {
> +            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
> +            fclose(file);
> +            free_detect_labels(ctx);
> +            return AVERROR(ENOMEM);
> +        }
> +
> +        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
> +            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
> +            fclose(file);
> +            free_detect_labels(ctx);

1. You are leaking label here.
2. You are repeating yourself with the cleanup code.
3. When you return an error in a filter's init function, the filter's
uninit function is called automatically. In this case this means that
free_detect_labels is called twice which is not only wasteful, but
harmful: You are freeing ctx->labels (and all labels contained in it) in
the first run, but you are not resetting the number of labels. If
ctx->label_count is > 0, there will be a segfault when
free_detect_labels is called the second time.
4. Return the error code.
(5. I consider your use of av_log for every error to be excessive.)

> +            return AVERROR(ENOMEM);
> +        }
> +    }
> +
> +    fclose(file);
> +    return 0;
> +}
> +
> +static av_cold int dnn_detect_init(AVFilterContext *context)
> +{
> +    DnnDetectContext *ctx = context->priv;
> +    int ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
> +    if (ret < 0)
> +        return ret;
> +    ff_dnn_set_proc(&ctx->dnnctx, NULL, dnn_detect_post_proc);
> +
> +    if (ctx->labels_filename) {
> +        return read_detect_label_file(context);
> +    }
> +    return 0;
> +}
> +
> +static int dnn_detect_query_formats(AVFilterContext *context)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
> +        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
> +        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
> +        AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
> +        AV_PIX_FMT_NV12,
> +        AV_PIX_FMT_NONE
> +    };
> +    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
> +    return ff_set_common_formats(context, fmts_list);
> +}
> +
> +static int dnn_detect_filter_frame(AVFilterLink *inlink, AVFrame *in)
> +{
> +    AVFilterContext *context  = inlink->dst;
> +    AVFilterLink *outlink = context->outputs[0];
> +    DnnDetectContext *ctx = context->priv;
> +    DNNReturnType dnn_result;
> +
> +    dnn_result = ff_dnn_execute_model(&ctx->dnnctx, in, NULL);
> +    if (dnn_result != DNN_SUCCESS){
> +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
> +        av_frame_free(&in);
> +        return AVERROR(EIO);
> +    }
> +
> +    return ff_filter_frame(outlink, in);
> +}
> +
> +static int dnn_detect_activate_sync(AVFilterContext *filter_ctx)
> +{
> +    AVFilterLink *inlink = filter_ctx->inputs[0];
> +    AVFilterLink *outlink = filter_ctx->outputs[0];
> +    AVFrame *in = NULL;
> +    int64_t pts;
> +    int ret, status;
> +    int got_frame = 0;
> +
> +    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
> +
> +    do {
> +        // drain all input frames
> +        ret = ff_inlink_consume_frame(inlink, &in);
> +        if (ret < 0)
> +            return ret;
> +        if (ret > 0) {
> +            ret = dnn_detect_filter_frame(inlink, in);
> +            if (ret < 0)
> +                return ret;
> +            got_frame = 1;
> +        }
> +    } while (ret > 0);
> +
> +    // if frame got, schedule to next filter
> +    if (got_frame)
> +        return 0;
> +
> +    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
> +        if (status == AVERROR_EOF) {
> +            ff_outlink_set_status(outlink, status, pts);
> +            return ret;
> +        }
> +    }
> +
> +    FF_FILTER_FORWARD_WANTED(outlink, inlink);
> +
> +    return FFERROR_NOT_READY;
> +}
> +
> +static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
> +{
> +    DnnDetectContext *ctx = outlink->src->priv;
> +    int ret;
> +    DNNAsyncStatusType async_state;
> +
> +    ret = ff_dnn_flush(&ctx->dnnctx);
> +    if (ret != DNN_SUCCESS) {
> +        return -1;
> +    }
> +
> +    do {
> +        AVFrame *in_frame = NULL;
> +        AVFrame *out_frame = NULL;
> +        async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
> +        if (out_frame) {
> +            av_assert0(in_frame == out_frame);
> +            ret = ff_filter_frame(outlink, out_frame);
> +            if (ret < 0)
> +                return ret;
> +            if (out_pts)
> +                *out_pts = out_frame->pts + pts;
> +        }
> +        av_usleep(5000);
> +    } while (async_state >= DAST_NOT_READY);
> +
> +    return 0;
> +}
> +
> +static int dnn_detect_activate_async(AVFilterContext *filter_ctx)
> +{
> +    AVFilterLink *inlink = filter_ctx->inputs[0];
> +    AVFilterLink *outlink = filter_ctx->outputs[0];
> +    DnnDetectContext *ctx = filter_ctx->priv;
> +    AVFrame *in = NULL;
> +    int64_t pts;
> +    int ret, status;
> +    int got_frame = 0;
> +    int async_state;
> +
> +    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
> +
> +    do {
> +        // drain all input frames
> +        ret = ff_inlink_consume_frame(inlink, &in);
> +        if (ret < 0)
> +            return ret;
> +        if (ret > 0) {
> +            if (ff_dnn_execute_model_async(&ctx->dnnctx, in, NULL) != DNN_SUCCESS) {
> +                return AVERROR(EIO);
> +            }
> +        }
> +    } while (ret > 0);
> +
> +    // drain all processed frames
> +    do {
> +        AVFrame *in_frame = NULL;
> +        AVFrame *out_frame = NULL;
> +        async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
> +        if (out_frame) {
> +            av_assert0(in_frame == out_frame);
> +            ret = ff_filter_frame(outlink, out_frame);
> +            if (ret < 0)
> +                return ret;
> +            got_frame = 1;
> +        }
> +    } while (async_state == DAST_SUCCESS);
> +
> +    // if frame got, schedule to next filter
> +    if (got_frame)
> +        return 0;
> +
> +    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
> +        if (status == AVERROR_EOF) {
> +            int64_t out_pts = pts;
> +            ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
> +            ff_outlink_set_status(outlink, status, out_pts);
> +            return ret;
> +        }
> +    }
> +
> +    FF_FILTER_FORWARD_WANTED(outlink, inlink);
> +
> +    return 0;
> +}
> +
> +static int dnn_detect_activate(AVFilterContext *filter_ctx)
> +{
> +    DnnDetectContext *ctx = filter_ctx->priv;
> +
> +    if (ctx->dnnctx.async)
> +        return dnn_detect_activate_async(filter_ctx);
> +    else
> +        return dnn_detect_activate_sync(filter_ctx);
> +}
> +
> +static av_cold void dnn_detect_uninit(AVFilterContext *context)
> +{
> +    DnnDetectContext *ctx = context->priv;
> +    ff_dnn_uninit(&ctx->dnnctx);
> +    free_detect_labels(ctx);
> +}
> +
> +static const AVFilterPad dnn_detect_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad dnn_detect_outputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_dnn_detect = {
> +    .name          = "dnn_detect",
> +    .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
> +    .priv_size     = sizeof(DnnDetectContext),
> +    .init          = dnn_detect_init,
> +    .uninit        = dnn_detect_uninit,
> +    .query_formats = dnn_detect_query_formats,
> +    .inputs        = dnn_detect_inputs,
> +    .outputs       = dnn_detect_outputs,
> +    .priv_class    = &dnn_detect_class,
> +    .activate      = dnn_detect_activate,
> +};
>