[FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add new graphicsub2text filter (OCR)

Andreas Rheinhardt andreas.rheinhardt at outlook.com
Wed Sep 22 05:04:37 EEST 2021


Soft Works:
> Signed-off-by: softworkz <softworkz at hotmail.com>
> ---
>  configure                        |   1 +
>  doc/filters.texi                 |  55 ++++++
>  libavfilter/Makefile             |   1 +
>  libavfilter/allfilters.c         |   1 +
>  libavfilter/sf_graphicsub2text.c | 326 +++++++++++++++++++++++++++++++
>  5 files changed, 384 insertions(+)
>  create mode 100644 libavfilter/sf_graphicsub2text.c
> 
> diff --git a/configure b/configure
> index 37fc4c20e7..2682e51435 100755
> --- a/configure
> +++ b/configure
> @@ -3601,6 +3601,7 @@ frei0r_deps_any="libdl LoadLibrary"
>  frei0r_filter_deps="frei0r"
>  frei0r_src_filter_deps="frei0r"
>  fspp_filter_deps="gpl"
> +graphicsub2text_filter_deps="libtesseract"
>  histeq_filter_deps="gpl"
>  hqdn3d_filter_deps="gpl"
>  interlace_filter_deps="gpl"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index da463e2cc1..2b6dfbe1d4 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -25248,6 +25248,61 @@ ffmpeg -i "https://streams.videolan.org/ffmpeg/mkv_subtitles.mkv" -filter_comple
>  @end example
>  @end itemize
>  
> + at section graphicsub2text
> +
> +Converts graphic subtitles to text subtitles by performing OCR.
> +
> +For this filter to be available, ffmpeg needs to be compiled with libtesseract (see https://github.com/tesseract-ocr/tesseract).
> +Language models need to be downloaded from https://github.com/tesseract-ocr/tessdata and put into a subfolder named 'tessdata' or into a folder specified via the environment variable 'TESSDATA_PREFIX'.
> +The path can also be specified via filter option (see below).
> +
> +Note: These models include the data for both OCR modes.
> +
> +Inputs:
> +- 0: Subtitles [bitmap]
> +
> +Outputs:
> +- 0: Subtitles [text]
> +
> +It accepts the following parameters:
> +
> + at table @option
> + at item ocr_mode
> +The character recognition mode to use.
> +
> +Supported OCR modes are:
> +
> + at table @var
> + at item 0, tesseract
> +This is the classic libtesseract operation mode. It is fast but less accurate than LSTM.
> + at item 1, lstm
> +Newer OCR implementation based on ML models. It usually provides better results, but requires more processing resources.
> + at item 2, both
> +Use a combination of both modes.
> + at end table
> +
> + at item tessdata_path
> +The path to a folder containing the language models to be used.
> +
> + at item language
> +The recognition language. It needs to match the first three characters of a language model file in the tessdata path.
> +
> + at end table
> +
> +
> + at subsection Examples
> +
> + at itemize
> + at item
> +Convert DVB graphic subtitles to ASS (text) subtitles
> +
> +Note: For this to work, you need to have the data file 'eng.traineddata' in a 'tessdata' subfolder (see above).
> + at example
> +ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv
> + at end example
> + at end itemize
> +
> +
>  @section graphicsub2video
>  
>  Renders graphic subtitles as video frames. 
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 39abf6d2a6..312b67982c 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -290,6 +290,7 @@ OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o qp_table.o
>  OBJS-$(CONFIG_GBLUR_FILTER)                  += vf_gblur.o
>  OBJS-$(CONFIG_GEQ_FILTER)                    += vf_geq.o
>  OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
> +OBJS-$(CONFIG_GRAPHICSUB2TEXT_FILTER)        += sf_graphicsub2text.o
>  OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER)       += vf_overlay_graphicsubs.o framesync.o
>  OBJS-$(CONFIG_GRAPHMONITOR_FILTER)           += f_graphmonitor.o
>  OBJS-$(CONFIG_GRAYWORLD_FILTER)              += vf_grayworld.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 77c6379302..ee5638dc3d 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -527,6 +527,7 @@ extern const AVFilter ff_avf_showwaves;
>  extern const AVFilter ff_avf_showwavespic;
>  extern const AVFilter ff_vaf_spectrumsynth;
>  extern const AVFilter ff_sf_censor;
> +extern const AVFilter ff_sf_graphicsub2text;
>  extern const AVFilter ff_sf_show_speaker;
>  extern const AVFilter ff_sf_split_cc;
>  extern const AVFilter ff_sf_stripstyles;
> diff --git a/libavfilter/sf_graphicsub2text.c b/libavfilter/sf_graphicsub2text.c
> new file mode 100644
> index 0000000000..157b76408e
> --- /dev/null
> +++ b/libavfilter/sf_graphicsub2text.c
> @@ -0,0 +1,326 @@
> +/*
> + * Copyright (c) 2021 softworkz
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * subtitle filter to convert graphical subs to text subs via OCR
> + */
> +
> +#include <tesseract/capi.h>
> +#include <libavcodec/ass.h>
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/opt.h"
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "subtitles.h"
> +#include "libavcodec/avcodec.h"
> +#include "libavutil/file.h"
> +
> +typedef struct SubOcrContext {
> +    const AVClass *class;
> +    int w, h;                   // output dimensions (taken from the input link when unset)
> +
> +    TessBaseAPI *tapi;          // libtesseract API handle, created in init()
> +    TessOcrEngineMode ocr_mode; // OEM_TESSERACT_ONLY, OEM_LSTM_ONLY or combined (option 'ocr_mode')
> +    char *tessdata_path;        // optional path to the language model folder (option 'tessdata_path')
> +    char *language;             // OCR language code, e.g. "eng" (option 'language')
> +
> +    int readorder_counter;      // incrementing ReadOrder value for generated ASS dialog events
> +
> +    AVFrame *pending_frame;     // frame held back until the next event determines its end time
> +} SubOcrContext;
> +
> +
> +static int init(AVFilterContext *ctx)
> +{
> +    SubOcrContext *s = ctx->priv;
> +    const char* tver = TessVersion();
> +    int ret;
> +
> +    s->tapi = TessBaseAPICreate();
> +
> +    if (!s->tapi || !tver || !strlen(tver)) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to access libtesseract\n");
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    av_log(ctx, AV_LOG_VERBOSE, "Initializing libtesseract, version: %s\n", tver);
> +
> +    ret = TessBaseAPIInit4(s->tapi, s->tessdata_path, s->language, s->ocr_mode, NULL, 0, NULL, NULL, 0, 1);
> +    if (ret < 0 ) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to initialize libtesseract. Error: %d\n", ret);
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    ret = TessBaseAPISetVariable(s->tapi, "tessedit_char_blacklist", "|");
> +    if (ret < 0 ) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to set 'tessedit_char_blacklist'. Error: %d\n", ret);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    return 0;
> +}
> +
> +static void uninit(AVFilterContext *ctx)
> +{
> +    SubOcrContext *s = ctx->priv;
> +
> +    TessBaseAPIEnd(s->tapi);
> +    TessBaseAPIDelete(s->tapi);

Beware: uninit is also called on init failure, so it might be that
s->tapi is NULL or that TessBaseAPIInit4 did not succeed.

> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    AVFilterFormats *formats, *formats2;
> +    AVFilterLink *inlink = ctx->inputs[0];
> +    AVFilterLink *outlink = ctx->outputs[0];
> +    static const enum AVSubtitleType in_fmts[] = { AV_SUBTITLE_FMT_BITMAP, AV_SUBTITLE_FMT_NONE };
> +    static const enum AVSubtitleType out_fmts[] = { AV_SUBTITLE_FMT_ASS, AV_SUBTITLE_FMT_NONE };
> +    int ret;
> +
> +    /* set input format */
> +    formats = ff_make_format_list(in_fmts);
> +    if ((ret = ff_formats_ref(formats, &inlink->outcfg.formats)) < 0)
> +        return ret;
> +
> +    /* set output format */
> +    formats2 = ff_make_format_list(out_fmts);
> +    if ((ret = ff_formats_ref(formats2, &outlink->incfg.formats)) < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    SubOcrContext *s = ctx->priv;
> +
> +    if (s->w <= 0 || s->h <= 0) {
> +        s->w = inlink->w;
> +        s->h = inlink->h;
> +    }
> +    return 0;
> +}
> +
> +static int config_output(AVFilterLink *outlink)
> +{
> +    const AVFilterContext *ctx  = outlink->src;
> +    SubOcrContext *s = ctx->priv;
> +
> +    outlink->format = AV_SUBTITLE_FMT_ASS;
> +    outlink->w = s->w;
> +    outlink->h = s->h;
> +
> +    return 0;
> +}
> +
> +static uint8_t* create_grayscale_image(AVFilterContext *ctx, AVSubtitleArea *area)
> +{
> +    uint8_t gray_pal[256];
> +    const size_t img_size = area->buf[0]->size;
> +    const uint8_t* img    = area->buf[0]->data;
> +    uint8_t* gs_img       = av_malloc(img_size);
> +
> +    if (!gs_img)
> +        return NULL;
> +
> +    for (unsigned i = 0; i < 256; i++) {
> +        const uint8_t *col = (uint8_t*)&area->pal[i];
> +        const int val      = (int)col[3] * FFMAX3(col[0], col[1], col[2]);
> +        gray_pal[i]        = (uint8_t)(val >> 8);
> +    }
> +
> +    for (unsigned i = 0; i < img_size; i++)
> +        gs_img[i] = 255 - gray_pal[img[i]];
> +
> +    return gs_img;
> +}
> +
> +static int convert_area(AVFilterContext *ctx, AVSubtitleArea *area)
> +{
> +    SubOcrContext *s = ctx->priv;
> +    char *ocr_text = NULL;
> +    int ret;
> +    uint8_t *gs_img = create_grayscale_image(ctx, area);
> +
> +    if (!gs_img)
> +        return AVERROR(ENOMEM);
> +
> +    TessBaseAPISetImage(s->tapi, gs_img, area->w, area->h, 1, area->linesize[0]);
> +    TessBaseAPISetSourceResolution(s->tapi, 70);
> +
> +    ret = TessBaseAPIRecognize(s->tapi, NULL);
> +    if (ret == 0)
> +        ocr_text = TessBaseAPIGetUTF8Text(s->tapi);
> +
> +    if (!ocr_text) {
> +        av_log(ctx, AV_LOG_WARNING, "OCR didn't return a text. ret=%d\n", ret);
> +        area->ass = av_strdup("");
> +    }
> +    else {
> +        size_t len = strlen(ocr_text);
> +
> +        if (len > 0 && ocr_text[len - 1] == '\n')
> +            ocr_text[len - 1] = 0;
> +
> +        av_log(ctx, AV_LOG_VERBOSE, "OCR Result: %s\n", ocr_text);
> +
> +        area->ass = av_strdup(ocr_text);
> +
> +        TessDeleteText(ocr_text);
> +    }
> +
> +    av_freep(&gs_img);
> +    av_buffer_unref(&area->buf[0]);
> +    area->type = AV_SUBTITLE_FMT_ASS;
> +
> +    return 0;
> +}
> +
> +static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    SubOcrContext *s = ctx->priv;
> +    AVFilterLink *outlink = inlink->dst->outputs[0];
> +    int ret, frame_sent = 0;
> +
> +    if (s->pending_frame) {
> +        const uint64_t pts_diff = frame->subtitle_pts - s->pending_frame->subtitle_pts;
> +        s->pending_frame->subtitle_end_time = (uint32_t)(pts_diff / 1000);
> +
> +        ret = ff_filter_frame(outlink, s->pending_frame);
> +        s->pending_frame = NULL;
> +        if (ret < 0)
> +            return  ret;
> +
> +        frame_sent = 1;
> +
> +        if (frame->num_subtitle_areas == 0) {
> +            // No need to forward this empty frame
> +            av_frame_unref(frame);

Leak.

> +            return 0;
> +        }
> +    }
> +
> +    av_frame_make_writable(frame);
> +
> +    if (!frame)
> +        return AVERROR(ENOMEM);

Wrong check; and leak.

> +
> +    frame->format = AV_SUBTITLE_FMT_ASS;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "filter_frame sub_pts: %"PRIu64", start_time: %d, end_time: %d, num_areas: %d\n", 
> +        frame->subtitle_pts, frame->subtitle_start_time, frame->subtitle_end_time, frame->num_subtitle_areas);
> +
> +    if (frame->num_subtitle_areas > 1 &&
> +        frame->subtitle_areas[0]->y > frame->subtitle_areas[frame->num_subtitle_areas - 1]->y) {
> +
> +        for (unsigned i = 0; i < frame->num_subtitle_areas / 2; i++)
> +            FFSWAP(AVSubtitleArea*, frame->subtitle_areas[i], frame->subtitle_areas[frame->num_subtitle_areas - i - 1]);
> +    }
> +    
> +    for (unsigned i = 0; i < frame->num_subtitle_areas; i++) {
> +        char *tmp;
> +        AVSubtitleArea *area = frame->subtitle_areas[i];
> +
> +        ret = convert_area(ctx, area);
> +        if (ret < 0)
> +            return ret;
> +
> +        if (strlen(area->ass)) {
> +            tmp = area->ass;
> +
> +            if (i == 0)
> +                area->ass = ff_ass_get_dialog(s->readorder_counter++, 0, "Default", NULL, tmp);
> +            else
> +                area->ass = av_asprintf("\\N%s", tmp);
> +
> +            av_free(tmp);
> +        }
> +    }
> +
> +    if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >= 30000) {

Where does this number come from?

> +        // Can't send it without end time, wait for the next frame to determine the end_display time
> +        s->pending_frame = frame;
> +
> +        if (frame_sent)
> +            return 0;
> +
> +        // To keep all going, send an empty frame instead
> +        frame = ff_get_subtitles_buffer(outlink, AV_SUBTITLE_FMT_ASS);
> +        if (!frame)
> +            return AVERROR(ENOMEM);
> +
> +        av_frame_copy_props(frame, s->pending_frame);
> +        frame->subtitle_end_time = 1;
> +    }
> +
> +    return ff_filter_frame(outlink, frame);
> +}
> +
> +#define OFFSET(x) offsetof(SubOcrContext, x)
> +#define FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
> +
> +// Filter options; the ocr_mode named constants map directly to
> +// Tesseract's TessOcrEngineMode values (OEM_*).
> +static const AVOption graphicsub2text_options[] = {
> +    { "ocr_mode",       "set ocr mode",                  OFFSET(ocr_mode),      AV_OPT_TYPE_INT,    {.i64=OEM_TESSERACT_ONLY},          OEM_TESSERACT_ONLY, 2, FLAGS, "ocr_mode" },
> +    {   "tesseract",    "classic tesseract ocr",         0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_TESSERACT_ONLY},          0,                  0, FLAGS, "ocr_mode" },
> +    {   "lstm",         "lstm (ML based)",               0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_LSTM_ONLY},               0,                  0, FLAGS, "ocr_mode" },
> +    {   "both",         "use both models combined",      0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_TESSERACT_LSTM_COMBINED}, 0,                  0, FLAGS, "ocr_mode" },
> +    { "tessdata_path",  "path to tesseract data",        OFFSET(tessdata_path), AV_OPT_TYPE_STRING, {.str = NULL},                      0,                  0, FLAGS, NULL   },
> +    { "language",       "ocr language",                  OFFSET(language),      AV_OPT_TYPE_STRING, {.str = "eng"},                     0,                  0, FLAGS, NULL   },
> +    { NULL },
> +};
> +
> +AVFILTER_DEFINE_CLASS(graphicsub2text);
> +
> +// Input pad: receives bitmap subtitle frames.
> +static const AVFilterPad inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_SUBTITLE,
> +        .filter_frame = filter_frame,
> +        .config_props = config_input,
> +    },
> +};
> +
> +// Output pad: delivers ASS (text) subtitle frames.
> +static const AVFilterPad outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_SUBTITLE,
> +        .config_props  = config_output,
> +    },
> +};
> +
> +/*
> + * Example:
> + * ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv
> + */
> +// Filter registration; the name corresponds to the
> +// CONFIG_GRAPHICSUB2TEXT_FILTER entries in configure/Makefile/allfilters.c.
> +const AVFilter ff_sf_graphicsub2text = {
> +    .name          = "graphicsub2text",
> +    .description   = NULL_IF_CONFIG_SMALL("Convert graphical subtitles to text subtitles via OCR"),
> +    .init          = init,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .priv_size     = sizeof(SubOcrContext),
> +    .priv_class    = &graphicsub2text_class,
> +    FILTER_INPUTS(inputs),
> +    FILTER_OUTPUTS(outputs),
> +};
> 



More information about the ffmpeg-devel mailing list