[FFmpeg-devel] [PATCH v2 17/26] avfilter/graphicsub2text: Add new graphicsub2text filter (OCR)

Thu Jan 20 04:48:27 EET 2022

From: softworkz <softworkz at hotmail.com>

Signed-off-by: softworkz <softworkz at hotmail.com>
---
 configure                        |    1 +
 doc/filters.texi                 |   55 ++
 libavfilter/Makefile             |    2 +
 libavfilter/allfilters.c         |    1 +
 libavfilter/sf_graphicsub2text.c | 1132 ++++++++++++++++++++++++++++++
 5 files changed, 1191 insertions(+)
 create mode 100644 libavfilter/sf_graphicsub2text.c

diff --git a/configure b/configure
index c1d2bc41c2..ee7afffb05 100755
--- a/configure
+++ b/configure
@@ -3665,6 +3665,7 @@ frei0r_filter_deps="frei0r"
 frei0r_src_filter_deps="frei0r"
 fspp_filter_deps="gpl"
 gblur_vulkan_filter_deps="vulkan spirv_compiler"
+graphicsub2text_filter_deps="libtesseract"
 hflip_vulkan_filter_deps="vulkan spirv_compiler"
 histeq_filter_deps="gpl"
 hqdn3d_filter_deps="gpl"
diff --git a/doc/filters.texi b/doc/filters.texi
index 265a267e9d..9311714f82 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -25897,6 +25897,61 @@ ffmpeg -i "https://streams.videolan.org/ffmpeg/mkv_subtitles.mkv" -filter_comple
 @end example
 @end itemize
 
+ at section graphicsub2text
+
+Converts graphic subtitles to text subtitles by performing OCR.
+
+For this filter to be available, ffmpeg needs to be compiled with libtesseract (see https://github.com/tesseract-ocr/tesseract).
+Language models need to be downloaded from https://github.com/tesseract-ocr/tessdata and put into a subfolder named 'tessdata' or into a folder specified via the environment variable 'TESSDATA_PREFIX'.
+The path can also be specified via filter option (see below).
+
+Note: These models include the data for both OCR modes.
+
+Inputs:
+- 0: Subtitles [bitmap]
+
+Outputs:
+- 0: Subtitles [text]
+
+It accepts the following parameters:
+
+ at table @option
+ at item ocr_mode
+The character recognition mode to use.
+
+Supported OCR modes are:
+
+ at table @var
+ at item 0, tesseract
+This is the classic libtesseract operation mode. It is fast but less accurate than LSTM.
+ at item 1, lstm
+Newer OCR implementation based on ML models. It usually provides better results, but requires more processing resources.
+ at item 2, both
+Use a combination of both modes.
+ at end table
+
+ at item tessdata_path
+The path to a folder containing the language models to be used.
+
+ at item language
+The recognition language. It needs to match the first three characters of a language model file in the tessdata path.
+
+ at end table
+
+
+ at subsection Examples
+
+ at itemize
+ at item
+Convert DVB graphic subtitles to ASS (text) subtitles
+
+Note: For this to work, you need to have the data file 'eng.traineddata' in a 'tessdata' subfolder (see above).
+ at example
+ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=delay_when_no_duration=1" -c:s ass -y output.mkv
+ at end example
+ at end itemize
+
+
 @section graphicsub2video
 
 Renders graphic subtitles as video frames.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index c6a4a4f5ae..ead3e38507 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -299,6 +299,8 @@ OBJS-$(CONFIG_GBLUR_VULKAN_FILTER)           += vf_gblur_vulkan.o vulkan.o vulka
 OBJS-$(CONFIG_GEQ_FILTER)                    += vf_geq.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
 OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER)       += vf_overlaygraphicsubs.o framesync.o
+OBJS-$(CONFIG_GRAPHICSUB2TEXT_FILTER)        += sf_graphicsub2text.o
+OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER)       += vf_overlaygraphicsubs.o framesync.o
 OBJS-$(CONFIG_GRAPHMONITOR_FILTER)           += f_graphmonitor.o
 OBJS-$(CONFIG_GRAYWORLD_FILTER)              += vf_grayworld.o
 OBJS-$(CONFIG_GREYEDGE_FILTER)               += vf_colorconstancy.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 50498e8ec4..34576016ce 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -548,6 +548,7 @@ extern const AVFilter ff_avf_showwaves;
 extern const AVFilter ff_avf_showwavespic;
 extern const AVFilter ff_vaf_spectrumsynth;
 extern const AVFilter ff_sf_censor;
+extern const AVFilter ff_sf_graphicsub2text;
 extern const AVFilter ff_sf_showspeaker;
 extern const AVFilter ff_sf_splitcc;
 extern const AVFilter ff_sf_stripstyles;
diff --git a/libavfilter/sf_graphicsub2text.c b/libavfilter/sf_graphicsub2text.c
new file mode 100644
index 0000000000..9b413d314e
--- /dev/null
+++ b/libavfilter/sf_graphicsub2text.c
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (c) 2021 softworkz
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * subtitle filter to convert graphical subs to text subs via OCR
+ */
+
+#include <tesseract/capi.h>
+#include <libavutil/ass_internal.h>
+
+#include "drawutils.h"
+#include "libavutil/opt.h"
+#include "subtitles.h"
+
+#include "libavcodec/elbg.h"
+
+/**
+ * Bit flags naming the text attributes that can be recognized.
+ * convert_area() tests them against SubOcrContext.recognize.
+ */
+enum {
+    RFLAGS_NONE         = 0x000,
+    RFLAGS_HALIGN       = 0x001,
+    RFLAGS_VALIGN       = 0x002,
+    RFLAGS_FBOLD        = 0x004,
+    RFLAGS_FITALIC      = 0x008,
+    RFLAGS_FUNDERLINE   = 0x010,
+    RFLAGS_FONT         = 0x020,
+    RFLAGS_FONTSIZE     = 0x040,
+    RFLAGS_COLOR        = 0x080,
+    RFLAGS_OUTLINECOLOR = 0x100,
+    /* every flag above combined */
+    RFLAGS_ALL          = RFLAGS_HALIGN     |
+                          RFLAGS_VALIGN     |
+                          RFLAGS_FBOLD      |
+                          RFLAGS_FITALIC    |
+                          RFLAGS_FUNDERLINE |
+                          RFLAGS_FONT       |
+                          RFLAGS_FONTSIZE   |
+                          RFLAGS_COLOR      |
+                          RFLAGS_OUTLINECOLOR,
+};
+
+/**
+ * Private state of the graphicsub2text filter.
+ */
+typedef struct SubOcrContext {
+    const AVClass *class;
+    /* canvas size: used for the ASS header and copied to the output link */
+    int w, h;
+
+    /* libtesseract handle, created in init() and released in uninit() */
+    TessBaseAPI *tapi;
+    /* the next three fields are handed to TessBaseAPIInit4() in init() */
+    TessOcrEngineMode ocr_mode;
+    char *tessdata_path;
+    char *language;
+    /* when non-zero, convert_area() quantizes, crops and binarizes the bitmap before OCR */
+    int preprocess_images;
+    /* when non-zero, convert_area() writes intermediate bitmaps to files */
+    int dump_bitmaps;
+    /* NOTE(review): not referenced in the visible part of the file - presumably a user option */
+    int delay_when_no_duration;
+    /* bitmask of RFLAGS_* values checked in convert_area() */
+    int recognize;
+    /* multiplier applied to the computed font factor in convert_area() */
+    double font_size_factor;
+
+    /* NOTE(review): usage not visible here - presumably numbers the emitted ASS events */
+    int readorder_counter;
+
+    /* NOTE(review): usage not visible here */
+    AVFrame *pending_frame;
+    /* ASS header text built by create_ass_header() */
+    AVBufferRef *subtitle_header;
+    /* scratch buffer in which convert_area() assembles the ASS text */
+    AVBPrint buffer;
+
+    // Color Quantization Fields
+    struct ELBGContext *ctx;
+    /* random generator passed to avpriv_elbg_do(); seeded in init() */
+    AVLFG lfg;
+    /* work arrays (re)allocated by quantize_image_colors() */
+    int *codeword;
+    int *codeword_closest_codebook_idxs;
+    int *codebook;
+    /* byte positions of the components inside an RGB32 palette entry */
+    int r_idx, g_idx, b_idx, a_idx;
+    /* NOTE(review): usage not visible here */
+    int64_t last_subtitle_pts;
+} SubOcrContext;
+
+/**
+ * Palette indices describing an OCR input image.
+ * NOTE(review): not referenced in the visible part of the file.
+ */
+typedef struct OcrImageProps {
+    int background_color_index;
+    int fill_color_index;
+
+} OcrImageProps;
+
+/**
+ * Rescale a value given in milliseconds to AV_TIME_BASE_Q units.
+ */
+static int64_t ms_to_avtb(int64_t ms)
+{
+    const AVRational millisecond_tb = { 1, 1000 };
+
+    return av_rescale_q(ms, millisecond_tb, AV_TIME_BASE_Q);
+}
+
+/**
+ * Build the default ASS header for the configured canvas size and store
+ * it in SubOcrContext.subtitle_header.
+ *
+ * When no width/height is known, the ASS default play resolution is used
+ * and a warning is logged.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) on allocation failure
+ */
+static int create_ass_header(AVFilterContext* ctx)
+{
+    SubOcrContext* s = ctx->priv;
+    char *subtitle_header_text;
+
+    if (!(s->w && s->h)) {
+        av_log(ctx, AV_LOG_WARNING, "create_ass_header: no width and height specified!\n");
+        s->w = ASS_DEFAULT_PLAYRESX;
+        s->h = ASS_DEFAULT_PLAYRESY;
+    }
+
+    subtitle_header_text = avpriv_ass_get_subtitle_header_full(s->w, s->h, ASS_DEFAULT_FONT, ASS_DEFAULT_FONT_SIZE,
+        ASS_DEFAULT_COLOR, ASS_DEFAULT_COLOR, ASS_DEFAULT_BACK_COLOR, ASS_DEFAULT_BACK_COLOR, ASS_DEFAULT_BOLD,
+        ASS_DEFAULT_ITALIC, ASS_DEFAULT_UNDERLINE, ASS_DEFAULT_BORDERSTYLE, ASS_DEFAULT_ALIGNMENT, 0);
+
+    if (!subtitle_header_text)
+        return AVERROR(ENOMEM);
+
+    /* On success the buffer takes ownership of the text (default free callback) */
+    s->subtitle_header = av_buffer_create((uint8_t*)subtitle_header_text, strlen(subtitle_header_text) + 1, NULL, NULL, 0);
+
+    if (!s->subtitle_header) {
+        /* ownership was not transferred, so release the text here */
+        av_free(subtitle_header_text);
+        return AVERROR(ENOMEM);
+    }
+
+    return 0;
+}
+
+/**
+ * Filter init callback: create and configure the libtesseract handle and
+ * prepare the state used for color quantization.
+ *
+ * @return 0 on success, AVERROR(ENOSYS) when libtesseract cannot be
+ *         accessed or initialized, AVERROR(EINVAL) when the character
+ *         blacklist cannot be set
+ */
+static int init(AVFilterContext *ctx)
+{
+    SubOcrContext *s = ctx->priv;
+    const char* tver = TessVersion();
+    uint8_t rgba_map[4];
+    int ret;
+
+    /* Set up the print buffer before the first early return */
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    s->tapi = TessBaseAPICreate();
+
+    if (!s->tapi || !tver || !strlen(tver)) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to access libtesseract\n");
+        return AVERROR(ENOSYS);
+    }
+
+    av_log(ctx, AV_LOG_VERBOSE, "Initializing libtesseract, version: %s\n", tver);
+
+    /* TessBaseAPIInit4() reports success with 0 */
+    ret = TessBaseAPIInit4(s->tapi, s->tessdata_path, s->language, s->ocr_mode, NULL, 0, NULL, NULL, 0, 1);
+    if (ret != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize libtesseract. Error: %d\n", ret);
+        return AVERROR(ENOSYS);
+    }
+
+    /* TessBaseAPISetVariable() returns a BOOL: zero means the variable was not set */
+    if (!TessBaseAPISetVariable(s->tapi, "tessedit_char_blacklist", "|")) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to set 'tessedit_char_blacklist'.\n");
+        return AVERROR(EINVAL);
+    }
+
+    ff_fill_rgba_map(&rgba_map[0], AV_PIX_FMT_RGB32);
+
+    s->r_idx = rgba_map[0]; // R
+    s->g_idx = rgba_map[1]; // G
+    s->b_idx = rgba_map[2]; // B
+    s->a_idx = rgba_map[3]; // A
+
+    /* fixed seed for the generator handed to avpriv_elbg_do() */
+    av_lfg_init(&s->lfg, 123456789);
+
+    return 0;
+}
+
+/**
+ * Filter uninit callback: release the ASS header, the print buffer, the
+ * libtesseract handle and all color quantization state.
+ */
+static void uninit(AVFilterContext *ctx)
+{
+    SubOcrContext *s = ctx->priv;
+
+    av_buffer_unref(&s->subtitle_header);
+    av_bprint_finalize(&s->buffer, NULL);
+
+    if (s->tapi) {
+        TessBaseAPIEnd(s->tapi);
+        TessBaseAPIDelete(s->tapi);
+        s->tapi = NULL;
+    }
+
+    avpriv_elbg_free(&s->ctx);
+
+    /* work arrays allocated by quantize_image_colors() */
+    av_freep(&s->codebook);
+    av_freep(&s->codeword);
+    av_freep(&s->codeword_closest_codebook_idxs);
+}
+
+/**
+ * Declare the supported formats: bitmap subtitles on the input,
+ * ASS subtitles on the output.
+ *
+ * @return 0 on success, a negative error code from ff_formats_ref() otherwise
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVSubtitleType in_fmts[] = { AV_SUBTITLE_FMT_BITMAP, AV_SUBTITLE_FMT_NONE };
+    static const enum AVSubtitleType out_fmts[] = { AV_SUBTITLE_FMT_ASS, AV_SUBTITLE_FMT_NONE };
+    AVFilterLink *const in  = ctx->inputs[0];
+    AVFilterLink *const out = ctx->outputs[0];
+    AVFilterFormats *list;
+    int err;
+
+    /* input side */
+    list = ff_make_format_list(in_fmts);
+    err  = ff_formats_ref(list, &in->outcfg.formats);
+    if (err < 0)
+        return err;
+
+    /* output side */
+    list = ff_make_format_list(out_fmts);
+    err  = ff_formats_ref(list, &out->incfg.formats);
+    if (err < 0)
+        return err;
+
+    return 0;
+}
+
+/**
+ * Input link configuration: take the canvas size from the input link
+ * unless a positive size is already set, then create the ASS header.
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *filter = inlink->dst;
+    SubOcrContext *ocr = filter->priv;
+    const int have_size = ocr->w > 0 && ocr->h > 0;
+
+    if (!have_size) {
+        ocr->w = inlink->w;
+        ocr->h = inlink->h;
+    }
+
+    return create_ass_header(filter);
+}
+
+/**
+ * Output link configuration: ASS format, the filter's canvas size, and
+ * the time base and frame rate of the first input.
+ */
+static int config_output(AVFilterLink *outlink)
+{
+    const AVFilterContext *filter = outlink->src;
+    const AVFilterLink *src_link  = filter->inputs[0];
+    const SubOcrContext *ocr      = filter->priv;
+
+    outlink->format     = AV_SUBTITLE_FMT_ASS;
+    outlink->w          = ocr->w;
+    outlink->h          = ocr->h;
+    outlink->time_base  = src_link->time_base;
+    outlink->frame_rate = src_link->frame_rate;
+
+    return 0;
+}
+
+/**
+ * Release a subtitle area: unreference all of its buffers, free its
+ * text and ass strings and finally the area structure itself.
+ */
+static void free_subtitle_area(AVSubtitleArea *area)
+{
+    unsigned idx = 0;
+
+    while (idx < FF_ARRAY_ELEMS(area->buf)) {
+        av_buffer_unref(&area->buf[idx]);
+        idx++;
+    }
+
+    av_freep(&area->text);
+    av_freep(&area->ass);
+    av_free(area);
+}
+
+/**
+ * Create a copy of a subtitle area with its own writable bitmap buffers.
+ *
+ * Geometry, type, flags, color count and palette are copied; the text
+ * and ass strings are not.
+ *
+ * @param src area to copy
+ * @return the new area (to be released with free_subtitle_area()),
+ *         or NULL on allocation failure
+ */
+static AVSubtitleArea *copy_subtitle_area(const AVSubtitleArea *src)
+{
+    AVSubtitleArea *dst = av_mallocz(sizeof(AVSubtitleArea));
+
+    if (!dst)
+        return NULL;
+
+    dst->x         =  src->x;
+    dst->y         =  src->y;
+    dst->w         =  src->w;
+    dst->h         =  src->h;
+    dst->nb_colors =  src->nb_colors;
+    dst->type      =  src->type;
+    dst->flags     =  src->flags;
+
+    for (unsigned i = 0; i < AV_NUM_BUFFER_POINTERS; i++) {
+        if (src->h > 0 && src->w > 0 && src->buf[i]) {
+            /* reference into the matching slot, then detach it from the source */
+            dst->buf[i] = av_buffer_ref(src->buf[i]);
+            if (!dst->buf[i])
+                goto fail;
+
+            if (av_buffer_make_writable(&dst->buf[i]) < 0)
+                goto fail;
+
+            dst->linesize[i] = src->linesize[i];
+        }
+    }
+
+    memcpy(&dst->pal[0], &src->pal[0], sizeof(src->pal[0]) * 256);
+
+    return dst;
+
+fail:
+    /* releases the buffers referenced so far and the area itself */
+    free_subtitle_area(dst);
+    return NULL;
+}
+
+/**
+ * Reduce the palette of a bitmap subtitle area to three gray levels.
+ *
+ * The palette is first converted to grayscale, then ELBG is run over all
+ * pixels and both the palette and the pixel indices are rewritten in place.
+ * Areas that already have three colors or fewer are left untouched.
+ *
+ * @param s             filter context (owns the ELBG state and work arrays)
+ * @param subtitle_area area to quantize; modified in place
+ * @return a negative error code on failure, otherwise 0 or the
+ *         non-negative result of avpriv_elbg_do()
+ */
+static int quantize_image_colors(SubOcrContext *const s, AVSubtitleArea *subtitle_area)
+{
+    const int num_quantized_colors = 3;
+    int k, ret;
+    const int codeword_length = subtitle_area->w * subtitle_area->h;
+    uint8_t *src_data = subtitle_area->buf[0]->data;
+
+    if (subtitle_area->nb_colors <= num_quantized_colors) {
+        av_log(s, AV_LOG_DEBUG, "No need to quantize colors. Color count: %d\n", subtitle_area->nb_colors);
+        return 0;
+    }
+
+    // Convert palette to grayscale
+    for (int i = 0; i < subtitle_area->nb_colors; i++) {
+        uint8_t *color        = (uint8_t *)&subtitle_area->pal[i];
+        const uint8_t average = (uint8_t)(((int)color[s->r_idx] + color[s->g_idx] + color[s->b_idx]) / 3);
+        color[s->b_idx]       = average;
+        color[s->g_idx]       = average;
+        color[s->r_idx]       = average;
+    }
+
+    /* Re-Initialize */
+    s->codeword = av_realloc_f(s->codeword, codeword_length, 4 * sizeof(*s->codeword));
+    if (!s->codeword)
+        return AVERROR(ENOMEM);
+
+    s->codeword_closest_codebook_idxs = av_realloc_f(s->codeword_closest_codebook_idxs,
+        codeword_length, sizeof(*s->codeword_closest_codebook_idxs));
+    if (!s->codeword_closest_codebook_idxs)
+        return AVERROR(ENOMEM);
+
+    s->codebook = av_realloc_f(s->codebook, num_quantized_colors, 4 * sizeof(*s->codebook));
+    if (!s->codebook)
+        return AVERROR(ENOMEM);
+
+    /* build the codeword: four components (B, G, R, A) per pixel */
+    k = 0;
+    for (int i = 0; i < subtitle_area->h; i++) {
+        const uint8_t *p = src_data;
+        for (int j = 0; j < subtitle_area->w; j++) {
+            const uint8_t *color = (uint8_t *)&subtitle_area->pal[*p];
+            s->codeword[k++] = color[s->b_idx];
+            s->codeword[k++] = color[s->g_idx];
+            s->codeword[k++] = color[s->r_idx];
+            s->codeword[k++] = color[s->a_idx];
+            p++;
+        }
+        src_data += subtitle_area->linesize[0];
+    }
+
+    /* compute the codebook */
+    ret = avpriv_elbg_do(&s->ctx, s->codeword, 4, codeword_length, s->codebook,
+        num_quantized_colors, 1, s->codeword_closest_codebook_idxs, &s->lfg, 0);
+    if (ret < 0)
+        return ret;
+
+    /* Write Palette; shift as unsigned so that an alpha >= 128 cannot overflow a signed int */
+    for (int i = 0; i < num_quantized_colors; i++) {
+        subtitle_area->pal[i] = ((uint32_t)s->codebook[i*4+3] << 24) |
+                                ((uint32_t)s->codebook[i*4+2] << 16) |
+                                ((uint32_t)s->codebook[i*4+1] <<  8) |
+                                ((uint32_t)s->codebook[i*4  ] <<  0);
+    }
+
+    av_log(s, AV_LOG_DEBUG, "Quantized colors from %d to %d\n", subtitle_area->nb_colors, num_quantized_colors);
+
+    subtitle_area->nb_colors = num_quantized_colors;
+    src_data = subtitle_area->buf[0]->data;
+
+    /* Write Image: every pixel becomes the index of its closest codebook entry */
+    k = 0;
+    for (int i = 0; i < subtitle_area->h; i++) {
+        uint8_t *p = src_data;
+        for (int j = 0; j < subtitle_area->w; j++, p++) {
+            p[0] = (uint8_t)s->codeword_closest_codebook_idxs[k++];
+        }
+
+        src_data += subtitle_area->linesize[0];
+    }
+
+    return ret;
+}
+
+#define MEASURE_LINE_COUNT 6
+
+/**
+ * Guess the palette index of the background color.
+ *
+ * When all four corner pixels share one index, or the area is too low to
+ * sample three rows at the top and three at the bottom, the top-left
+ * pixel is returned. Otherwise the most frequent index within the first
+ * three and last three rows is returned.
+ *
+ * @param s             filter context (unused)
+ * @param subtitle_area bitmap area to inspect
+ * @return palette index assumed to be the background
+ */
+static uint8_t get_background_color_index(SubOcrContext *const s, const AVSubtitleArea *subtitle_area)
+{
+    const int linesize = subtitle_area->linesize[0];
+    const int width    = subtitle_area->w;
+    const int height   = subtitle_area->h;
+    /* uint8_t counters would wrap and never terminate with 256 colors */
+    const int nb_colors = FFMIN(subtitle_area->nb_colors, 256);
+    int index_counts[256] = {0};
+
+    const uint8_t *src_data = subtitle_area->buf[0]->data;
+    const uint8_t tl = src_data[0];
+    const uint8_t tr = src_data[width - 1];
+    const uint8_t bl = src_data[(ptrdiff_t)(height - 1) * linesize + 0];
+    const uint8_t br = src_data[(ptrdiff_t)(height - 1) * linesize + width - 1];
+    int max_index = 0;
+    int max_count;
+
+    // When all corner pixels are equal, assume that as background color
+    if ((tl == tr && tr == bl && bl == br) || height < MEASURE_LINE_COUNT)
+        return tl;
+
+    /* height >= MEASURE_LINE_COUNT here, so all sampled rows are valid and distinct */
+    {
+        const int rows[MEASURE_LINE_COUNT] = {
+            0, 1, 2, height - 3, height - 2, height - 1
+        };
+
+        for (unsigned int i = 0; i < MEASURE_LINE_COUNT; i++) {
+            const uint8_t *p = src_data + (ptrdiff_t)rows[i] * linesize;
+            for (int k = 0; k < width; k++)
+                index_counts[p[k]]++;
+        }
+    }
+
+    max_count = index_counts[0];
+
+    for (int i = 1; i < nb_colors; i++) {
+        if (index_counts[i] > max_count) {
+            max_count = index_counts[i];
+            max_index = i;
+        }
+    }
+
+    return (uint8_t)max_index;
+}
+
+/**
+ * Guess the palette indices of the text fill and the text outline.
+ *
+ * Every fifth row (starting at row 3) is scanned and each change to a
+ * non-background index is counted. The index with the most changes is
+ * taken as the outline; after discarding indices with fewer than a third
+ * of that count, the next most frequent one is taken as the text color.
+ *
+ * @param s                   filter context (unused)
+ * @param subtitle_area       bitmap area to inspect
+ * @param bg_color_index      palette index of the background
+ * @param outline_color_index receives the outline index, or 255 when it
+ *                            ends up equal to the returned text index
+ * @return palette index assumed to be the text fill color
+ */
+static uint8_t get_text_color_index(SubOcrContext *const s, const AVSubtitleArea *subtitle_area, const uint8_t bg_color_index, uint8_t *outline_color_index)
+{
+    const int linesize = subtitle_area->linesize[0];
+    /* uint8_t counters would wrap and never terminate with 256 colors */
+    const int nb_colors = FFMIN(subtitle_area->nb_colors, 256);
+    int index_counts[256] = {0};
+    uint8_t last_index = bg_color_index;
+    int max_count, min_req_count;
+    int max_index = 0;
+
+    for (int i = 3; i < subtitle_area->h - 3; i += 5) {
+        const uint8_t *p = subtitle_area->buf[0]->data + ((ptrdiff_t)linesize * i);
+        for (int k = 0; k < subtitle_area->w; k++) {
+            const uint8_t cur_index = p[k];
+
+            // When color hasn't changed, continue
+            if (cur_index == last_index)
+                continue;
+
+            if (cur_index != bg_color_index)
+                index_counts[cur_index]++;
+
+            last_index = cur_index;
+        }
+    }
+
+    max_count = index_counts[0];
+
+    for (int i = 1; i < nb_colors; i++) {
+        if (index_counts[i] > max_count) {
+            max_count = index_counts[i];
+            max_index = i;
+        }
+    }
+
+    /* drop indices that change too rarely to be text or outline */
+    min_req_count = max_count / 3;
+
+    for (int i = 1; i < nb_colors; i++) {
+        if (index_counts[i] < min_req_count)
+            index_counts[i] = 0;
+    }
+
+    *outline_color_index = (uint8_t)max_index;
+
+    /* second pass: the most frequent remaining index is the text color */
+    index_counts[max_index] = 0;
+    max_count = 0;
+
+    for (int i = 0; i < nb_colors; i++) {
+        if (index_counts[i] > max_count) {
+            max_count = index_counts[i];
+            max_index = i;
+        }
+    }
+
+    /* no second color found: report "no outline" */
+    if (*outline_color_index == max_index)
+        *outline_color_index = 255;
+
+    return (uint8_t)max_index;
+}
+
+/**
+ * Rewrite the palette so that the text color entry becomes 0xff000000
+ * and every other entry becomes 0xffffffff.
+ */
+static void make_image_binary(SubOcrContext *const s, AVSubtitleArea *subtitle_area, const uint8_t text_color_index)
+{
+    const int nb_entries = subtitle_area->nb_colors;
+
+    for (int entry = 0; entry < nb_entries; entry++) {
+        const int is_text = (entry == text_color_index);
+
+        subtitle_area->pal[entry] = is_text ? 0xff000000 : 0xffffffff;
+    }
+}
+
+/**
+ * Determine the rectangle that encloses the text pixels.
+ *
+ * The bitmap is sampled on every third row and every second column.
+ * The extent of the sampled pixels matching text_color_index is padded
+ * by 10 pixels and clipped to the area. When no usable extent is found,
+ * the whole area is returned and a warning is logged.
+ *
+ * @param[out] x,y,w,h resulting crop rectangle
+ * @return always 0
+ */
+static int get_crop_region(SubOcrContext *const s, const AVSubtitleArea *subtitle_area, uint8_t text_color_index, int *x, int *y, int *w, int *h)
+{
+    const int stride = subtitle_area->linesize[0];
+    const int area_w = subtitle_area->w;
+    const int area_h = subtitle_area->h;
+    int top    = area_h - 1, left  = area_w - 1;
+    int bottom = 0,          right = 0;
+
+    for (int row = 0; row < area_h; row += 3) {
+        const uint8_t *line = subtitle_area->buf[0]->data + ((ptrdiff_t)stride * row);
+        for (int col = 0; col < area_w; col += 2) {
+            if (line[col] != text_color_index)
+                continue;
+
+            top    = FFMIN(top, row);
+            left   = FFMIN(left, col);
+            bottom = FFMAX(bottom, row);
+            right  = FFMAX(right, col);
+        }
+    }
+
+    if (bottom > top && right > left) {
+        *x = FFMAX(left - 10, 0);
+        *y = FFMAX(top - 10, 0);
+        *w = FFMIN(right + 10 - *x, (area_w - *x));
+        *h = FFMIN(bottom + 10 - *y, (area_h - *y));
+        return 0;
+    }
+
+    av_log(s, AV_LOG_WARNING, "Unable to detect crop region\n");
+    *x = 0;
+    *y = 0;
+    *w = area_w;
+    *h = area_h;
+
+    return 0;
+}
+
+/**
+ * Crop the bitmap of a subtitle area to the given rectangle.
+ *
+ * A new tightly packed buffer (linesize == w) replaces buf[0]; the area's
+ * position and size are updated accordingly. The area is left unchanged
+ * on failure.
+ *
+ * @param s             filter context (unused)
+ * @param subtitle_area area to crop; modified in place
+ * @param x,y           top-left corner of the rectangle, relative to the area
+ * @param w,h           size of the rectangle
+ * @return 0 on success, AVERROR(EINVAL) when the rectangle does not lie
+ *         inside the area, AVERROR(ENOMEM) on allocation failure
+ */
+static int crop_area_bitmap(SubOcrContext *const s, AVSubtitleArea *subtitle_area, int x, int y, int w, int h)
+{
+    const int linesize = subtitle_area->linesize[0];
+    AVBufferRef *dst;
+    uint8_t *d;
+
+    /* refuse rectangles that would read outside of the source bitmap */
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 ||
+        w > subtitle_area->w - x || h > subtitle_area->h - y)
+        return AVERROR(EINVAL);
+
+    dst = av_buffer_allocz((size_t)h * w);
+    if (!dst)
+        return AVERROR(ENOMEM);
+
+    d = dst->data;
+
+    for (int i = 0; i < h; i++) {
+        const uint8_t *p = subtitle_area->buf[0]->data + ((ptrdiff_t)linesize * (y + i)) + x;
+        memcpy(d, p, w);
+        d += w;
+    }
+
+    /* hand the new buffer over to the area; this cannot fail */
+    av_buffer_unref(&subtitle_area->buf[0]);
+    subtitle_area->buf[0] = dst;
+
+    subtitle_area->w = w;
+    subtitle_area->h = h;
+    subtitle_area->x += x;
+    subtitle_area->y += y;
+    subtitle_area->linesize[0] = w;
+
+    return 0;
+}
+
+#define R 0
+#define G 1
+#define B 2
+#define A 3
+
+/**
+ * Append a formatted ASS override code to buf, opening a '{' group
+ * first when none is open yet.
+ *
+ * @param in_code non-zero when an override group is already open
+ * @return always 1, i.e. the new "group is open" state
+ */
+static int print_code(AVBPrint *buf, int in_code, const char *fmt, ...)
+{
+    va_list args;
+
+    if (in_code == 0)
+        av_bprint_chars(buf, '{', 1);
+
+    va_start(args, fmt);
+    av_vbprintf(buf, fmt, args);
+    va_end(args);
+
+    return 1;
+}
+
+/**
+ * Close an open override group by appending '}' to buf.
+ *
+ * @param in_code non-zero when an override group is currently open
+ * @return always 0, i.e. the new "no group open" state
+ */
+static int end_code(AVBPrint *buf, int in_code)
+{
+    if (!in_code)
+        return 0;
+
+    av_bprint_chars(buf, '}', 1);
+    return 0;
+}
+
+/**
+ * Convert the paletted bitmap of an area into an 8-bit gray image.
+ *
+ * Each palette entry is mapped to (byte 3 * max(bytes 0..2)) >> 8;
+ * with invert set, the result is subtracted from 255.
+ *
+ * @return newly allocated image of buf[0]->size bytes (to be freed by
+ *         the caller), or NULL on allocation failure
+ */
+static uint8_t* create_grayscale_image(AVFilterContext *ctx, AVSubtitleArea *area, int invert)
+{
+    uint8_t lut[256];
+    const size_t nb_bytes = area->buf[0]->size;
+    const uint8_t *src    = area->buf[0]->data;
+    uint8_t *out          = av_malloc(nb_bytes);
+
+    if (!out)
+        return NULL;
+
+    /* one gray level per palette entry, inverted up front if requested */
+    for (unsigned c = 0; c < 256; c++) {
+        const uint8_t *entry = (uint8_t*)&area->pal[c];
+        const int peak       = FFMAX3(entry[0], entry[1], entry[2]);
+        const uint8_t gray   = (uint8_t)(((int)entry[3] * peak) >> 8);
+
+        lut[c] = invert ? 255 - gray : gray;
+    }
+
+    for (unsigned i = 0; i < nb_bytes; i++)
+        out[i] = lut[src[i]];
+
+    return out;
+}
+
+/**
+ * Create a two-level image from a paletted bitmap: pixels with the text
+ * color index become 0, all others 255.
+ *
+ * @return newly allocated image of buf[0]->size bytes (to be freed by
+ *         the caller), or NULL on allocation failure
+ */
+static uint8_t* create_bitmap_image(AVFilterContext *ctx, AVSubtitleArea *area, const uint8_t text_color_index)
+{
+    const size_t nb_bytes = area->buf[0]->size;
+    const uint8_t *src    = area->buf[0]->data;
+    uint8_t *out          = av_malloc(nb_bytes);
+
+    if (!out)
+        return NULL;
+
+    for (unsigned i = 0; i < nb_bytes; i++)
+        out[i] = (src[i] == text_color_index) ? 0 : 255;
+
+    return out;
+}
+
+/**
+ * Dump the bitmap of an area to "<filename>.ppm" as a binary PPM (P6).
+ *
+ * Despite the function name, no PNG is written. For each pixel, bits
+ * 16..23, 8..15 and 0..7 of its palette entry are written, in that order.
+ * A failure to open the file is reported via perror() and otherwise ignored.
+ */
+static void png_save(AVFilterContext *ctx, const char *filename, AVSubtitleArea *area)
+{
+    char path[40];
+    FILE *out;
+    const uint8_t *pixels = area->buf[0]->data;
+
+    snprintf(path, sizeof(path), "%s.ppm", filename);
+
+    out = fopen(path, "wb");
+    if (!out) {
+        perror(path);
+        return;
+    }
+
+    fprintf(out, "P6\n"
+            "%d %d\n"
+            "%d\n",
+            area->w, area->h, 255);
+
+    for (int row = 0; row < area->h; row++) {
+        const uint8_t *line = pixels + row * area->linesize[0];
+
+        for (int col = 0; col < area->w; col++) {
+            const uint32_t entry = area->pal[line[col]];
+
+            putc((entry >> 16) & 0xff, out);
+            putc((entry >>  8) & 0xff, out);
+            putc((entry >>  0) & 0xff, out);
+        }
+    }
+
+    fclose(out);
+}
+
+/**
+ * Find the position of the largest positive value in a 256-entry table.
+ *
+ * @return index of the first entry holding the maximum, or 0 when no
+ *         entry is greater than zero
+ */
+static int get_max_index(int score[256])
+{
+    int best_index = 0;
+    int best_score = 0;
+
+    for (int idx = 0; idx < 256; idx++) {
+        if (score[idx] <= best_score)
+            continue;
+
+        best_score = score[idx];
+        best_index = idx;
+    }
+
+    return best_index;
+}
+
+/**
+ * Look up the original colors of the word the result iterator points at.
+ *
+ * Every third row of the word's bounding box is scanned in the
+ * preprocessed area. Wherever a pixel repeats the index of its left
+ * neighbour and that index is the background, text or outline index,
+ * the palette index found at the same position in original_area gets a
+ * vote. The winning entries of the original palette are returned.
+ *
+ * @param[out] bg_color,text_color,outline_color only written when at
+ *             least one vote was collected for the respective class
+ * @return 0 on success, the negative iterator result when the bounding
+ *         box query reports one, AVERROR(EINVAL) when the box lies
+ *         outside of the area
+ */
+static int get_word_colors(AVFilterContext *ctx, TessResultIterator* ri, const AVSubtitleArea* area, const AVSubtitleArea* original_area,
+                           uint8_t bg_color_index, uint8_t text_color_index, uint8_t outline_color_index,
+                           uint32_t* bg_color, uint32_t* text_color, uint32_t* outline_color)
+{
+    int bg_score[256] = {0}, text_score[256] = {0}, outline_score[256] = {0};
+    int left = 0, top = 0, right = 0, bottom = 0;
+    int winner, ret;
+
+    ret = TessPageIteratorBoundingBox((TessPageIterator*)ri, RIL_WORD, &left, &top, &right, &bottom);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_WARNING, "get_word_colors: IteratorBoundingBox failed: %d\n", ret);
+        return  ret;
+    }
+
+    if (left >= area->w || right >= area->w || top >= area->h || bottom >= area->h) {
+        av_log(ctx, AV_LOG_WARNING, "get_word_colors: word bounding box (l: %d, t: %d r: %d, b: %d) out of image bounds (%dx%d)\n", left,top, right, bottom, area->w, area->h);
+        return  AVERROR(EINVAL);
+    }
+
+    for (int row = top; row < bottom; row += 3) {
+        const uint8_t *quant = area->buf[0]->data + (row * area->linesize[0]) + left;
+        const uint8_t *orig  = original_area->buf[0]->data + (row * original_area->linesize[0]) + left;
+        uint8_t previous = 255;
+
+        for (int off = 0; off < right - left; off++) {
+            const uint8_t current = quant[off];
+
+            /* only pixels inside a run of equal indices get a vote */
+            if (current == previous) {
+                if (current == bg_color_index)
+                    bg_score[orig[off]]++;
+                if (current == text_color_index)
+                    text_score[orig[off]]++;
+                if (current == outline_color_index)
+                    outline_score[orig[off]]++;
+            }
+
+            previous = current;
+        }
+    }
+
+    winner = get_max_index(bg_score);
+    if (bg_score[winner] > 0)
+        *bg_color = original_area->pal[winner];
+
+    winner = get_max_index(text_score);
+    if (text_score[winner] > 0)
+        *text_color = original_area->pal[winner];
+
+    winner = get_max_index(outline_score);
+    if (outline_score[winner] > 0)
+        *outline_color = original_area->pal[winner];
+
+    return 0;
+}
+
+static int convert_area(AVFilterContext *ctx, AVSubtitleArea *area, const AVFrame *frame, const unsigned area_index, int *margin_v)
+{
+    SubOcrContext *s = ctx->priv;
+    char *ocr_text = NULL;
+    int ret = 0;
+    uint8_t *gs_img;
+    uint8_t bg_color_index;
+    uint8_t text_color_index = 255;
+    uint8_t outline_color_index = 255;
+    char filename[32];
+    AVSubtitleArea *original_area = copy_subtitle_area(area);
+
+    if (!original_area)
+        return AVERROR(ENOMEM);
+
+    if (area->w < 6 || area->h < 6) {
+        area->ass = NULL;
+        goto exit;
+    }
+
+    if (s->dump_bitmaps) {
+        snprintf(filename, sizeof(filename), "graphicsub2text_%"PRId64"_%d_original", frame->subtitle_timing.start_pts, area_index);
+        png_save(ctx, filename, area);
+    }
+
+    if (s->preprocess_images) {
+        ret = quantize_image_colors(s, area);
+        if (ret < 0)
+            goto exit;
+        if (s->dump_bitmaps && original_area->nb_colors != area->nb_colors) {
+            snprintf(filename, sizeof(filename), "graphicsub2text_%"PRId64"_%d_quantized", frame->subtitle_timing.start_pts, area_index);
+            png_save(ctx, filename, area);
+        }
+    }
+
+    bg_color_index = get_background_color_index(s, area);
+
+    if (s->preprocess_images) {
+        int x, y, w, h;
+
+        for (int i = 0; i < area->nb_colors; ++i) {
+            av_log(s, AV_LOG_DEBUG, "Color #%d: %0.8X\n", i, area->pal[i]);
+        }
+
+        text_color_index = get_text_color_index(s, area, bg_color_index, &outline_color_index);
+
+        get_crop_region(s, area, text_color_index, &x, &y, &w, &h);
+
+        if ((ret = crop_area_bitmap(s, area, x, y, w, h) < 0))
+            goto exit;
+
+        if ((ret = crop_area_bitmap(s, original_area, x, y, w, h) < 0))
+            goto exit;
+
+        make_image_binary(s, area, text_color_index);
+
+        if (s->dump_bitmaps) {
+            snprintf(filename, sizeof(filename), "graphicsub2text_%"PRId64"_%d_preprocessed", frame->subtitle_timing.start_pts, area_index);
+            png_save(ctx, filename, area);
+        }
+
+        gs_img = create_bitmap_image(ctx, area, text_color_index);
+    } else
+        gs_img = create_grayscale_image(ctx, area, 1);
+
+    if (!gs_img) {
+        ret = AVERROR(ENOMEM);
+        goto exit;
+    }
+
+    area->type = AV_SUBTITLE_FMT_ASS;
+    TessBaseAPISetImage(s->tapi, gs_img, area->w, area->h, 1, area->linesize[0]);
+
+    TessBaseAPISetSourceResolution(s->tapi, 72);
+
+    ret = TessBaseAPIRecognize(s->tapi, NULL);
+    if (ret == 0)
+        ocr_text = TessBaseAPIGetUTF8Text(s->tapi);
+
+    if (!ocr_text || !strlen(ocr_text)) {
+        av_log(ctx, AV_LOG_WARNING, "OCR didn't return a text. ret=%d\n", ret);
+        area->ass = NULL;
+
+        goto exit;
+    }
+
+    const size_t len = strlen(ocr_text);
+    if (len > 0 && ocr_text[len - 1] == '\n')
+        ocr_text[len - 1] = 0;
+
+    av_log(ctx, AV_LOG_VERBOSE, "OCR Result: %s\n", ocr_text);
+
+    area->ass = av_strdup(ocr_text);
+    TessDeleteText(ocr_text);
+
+    // End of simple recognition
+
+    if (s->recognize != RFLAGS_NONE) {
+        TessResultIterator* ri = 0;
+        const TessPageIteratorLevel level = RIL_WORD;
+        int cur_is_bold = 0, cur_is_italic = 0, cur_is_underlined = 0, cur_pointsize = 0;
+        uint32_t cur_text_color = 0, cur_bg_color = 0, cur_outline_color = 0;
+
+        char *cur_font_name = NULL;
+        int valign = 0; // 0: bottom, 4: top, 8 middle
+        int halign = 2; // 1: left, 2: center, 3: right
+        int in_code = 0;
+        double font_factor = (0.000666 * (s->h - 480) + 1) * s->font_size_factor;
+
+        av_freep(&area->ass);
+        av_bprint_clear(&s->buffer);
+
+        ri = TessBaseAPIGetIterator(s->tapi);
+
+        // Horizontal Alignment
+        if (s->w && s->recognize & RFLAGS_HALIGN) {
+            int left_margin = area->x;
+            int right_margin = s->w - area->x - area->w;
+            double relative_diff = ((double)left_margin - right_margin) / s->w;
+
+            if (FFABS(relative_diff) < 0.1)
+                halign = 2; // center
+            else if (relative_diff > 0)
+                halign = 3; // right
+            else
+                halign = 1; // left
+        }
+
+        // Vertical Alignment
+        if (s->h && frame->height && s->recognize & RFLAGS_VALIGN) {
+            int left = 0, top = 0, right = 0, bottom = 0;
+
+            TessPageIteratorBoundingBox((TessPageIterator*)ri, RIL_TEXTLINE, &left, &top, &right, &bottom);
+            av_log(s, AV_LOG_DEBUG, "RIL_TEXTLINE - TOP: %d  BOTTOM: %d HEIGHT: %d\n", top, bottom, bottom - top);
+
+            TessPageIteratorBoundingBox((TessPageIterator*)ri, RIL_BLOCK, &left, &top, &right, &bottom);
+
+            const int vertical_pos = area->y + area->h / 2;
+            if (vertical_pos < s->h / 3) {
+                *margin_v = area->y + top;
+                valign = 4;
+            }
+            else if (vertical_pos < s->h / 3 * 2) {
+                *margin_v = 0;
+                valign = 8;
+            } else {
+                *margin_v = frame->height - area->y - area->h;
+                valign = 0;
+            }
+        }
+
+        if (*margin_v < 0)
+            *margin_v = 0;
+
+        // Set alignment when not default (2)
+        if ((valign | halign) != 2)
+            in_code = print_code(&s->buffer, in_code, "\\a%d", valign | halign);
+
+        do {
+            int is_bold, is_italic, is_underlined, is_monospace, is_serif, is_smallcaps, pointsize, font_id;
+            char* word;
+            const char *font_name = TessResultIteratorWordFontAttributes(ri, &is_bold, &is_italic, &is_underlined, &is_monospace, &is_serif, &is_smallcaps, &pointsize, &font_id);
+            uint32_t text_color = 0, bg_color = 0, outline_color = 0;
+
+            if (cur_is_underlined && !is_underlined && s->recognize & RFLAGS_FUNDERLINE)
+                in_code = print_code(&s->buffer, in_code, "\\u0");
+
+            if (cur_is_bold && !is_bold && s->recognize & RFLAGS_FBOLD)
+                in_code = print_code(&s->buffer, in_code, "\\b0");
+
+            if (cur_is_italic && !is_italic && s->recognize & RFLAGS_FITALIC)
+                in_code = print_code(&s->buffer, in_code, "\\i0");
+
+
+            if (TessPageIteratorIsAtBeginningOf((TessPageIterator*)ri, RIL_TEXTLINE) && !TessPageIteratorIsAtBeginningOf((TessPageIterator*)ri, RIL_BLOCK)) {
+                in_code = end_code(&s->buffer, in_code);
+                av_bprintf(&s->buffer, "\\N");
+            }
+
+            if (get_word_colors(ctx, ri, area, original_area, bg_color_index, text_color_index, outline_color_index, &bg_color, &text_color, &outline_color) == 0) {
+
+                if (text_color > 0 && cur_text_color != text_color && s->recognize & RFLAGS_COLOR) {
+                    const uint8_t* tval = (uint8_t*)&text_color;
+                    const int color = (int)tval[R] << 16 | (int)tval[G] << 8 | tval[B];
+
+                    in_code = print_code(&s->buffer, in_code, "\\1c&H%0.6X&", color);
+                    if (tval[A] != 255)
+                        in_code = print_code(&s->buffer, in_code, "\\1a&H%0.2X&", 255 - tval[A]);
+                }
+
+                if (outline_color > 0 && cur_outline_color != outline_color && s->recognize & RFLAGS_OUTLINECOLOR) {
+                    const uint8_t* tval = (uint8_t*)&outline_color;
+                    const int color = (int)tval[R] << 16 | (int)tval[G] << 8 | tval[B];
+
+                    in_code = print_code(&s->buffer, in_code, "\\3c&H%0.6X&\\bord2", color);
+                    in_code = print_code(&s->buffer, in_code, "\\3a&H%0.2X&", FFMIN(255 - tval[A], 30));
+                }
+
+                cur_text_color = text_color;
+                cur_outline_color = outline_color;
+            }
+
+            if (font_name && strlen(font_name) && s->recognize & RFLAGS_FONT) {
+                if (!cur_font_name || !strlen(cur_font_name) || strcmp(cur_font_name, font_name) != 0) {
+                    char *sanitized_font_name = av_strireplace(font_name, "_", " ");
+                    if (!sanitized_font_name) {
+                        ret = AVERROR(ENOMEM);
+                        goto exit;
+                    }
+
+                    in_code = print_code(&s->buffer, in_code, "\\fn%s", sanitized_font_name);
+                    av_freep(&sanitized_font_name);
+
+                    if (cur_font_name)
+                        av_freep(&cur_font_name);
+                    cur_font_name = av_strdup(font_name);
+                    if (!cur_font_name) {
+                        ret = AVERROR(ENOMEM);
+                        goto exit;
+                    }
+                }
+            }
+
+            if (pointsize != cur_pointsize && s->recognize & RFLAGS_FONTSIZE) {
+                av_log(s, AV_LOG_DEBUG, "pointsize - pointsize: %d\n", pointsize);
+                in_code = print_code(&s->buffer, in_code, "\\fs%d", (int)(pointsize * font_factor));
+                cur_pointsize = pointsize;
+            }
+
+            if (is_italic && !cur_is_italic && s->recognize & RFLAGS_FITALIC)
+                in_code = print_code(&s->buffer, in_code, "\\i1");
+
+            if (is_bold && !cur_is_bold && s->recognize & RFLAGS_FBOLD)
+                in_code = print_code(&s->buffer, in_code, "\\b1");
+
+            if (is_underlined && !cur_is_underlined && s->recognize & RFLAGS_FUNDERLINE)
+                in_code = print_code(&s->buffer, in_code, "\\u1");
+
+            in_code = end_code(&s->buffer, in_code);
+
+            cur_is_underlined = is_underlined;
+            cur_is_bold = is_bold;
+            cur_is_italic = is_italic;
+
+            if (!TessPageIteratorIsAtBeginningOf((TessPageIterator*)ri, RIL_TEXTLINE))
+                av_bprint_chars(&s->buffer, ' ', 1);
+
+            word = TessResultIteratorGetUTF8Text(ri, level);
+            av_bprint_append_data(&s->buffer, word, strlen(word));
+            TessDeleteText(word);
+
+        } while (TessResultIteratorNext(ri, level));
+
+        if (!av_bprint_is_complete(&s->buffer))
+            ret = AVERROR(ENOMEM);
+        else {
+            av_log(ctx, AV_LOG_VERBOSE, "ASS Result: %s\n", s->buffer.str);
+            area->ass = av_strdup(s->buffer.str);
+        }
+
+        TessResultIteratorDelete(ri);
+        av_freep(&cur_font_name);
+    }
+
+exit:
+    free_subtitle_area(original_area);
+    av_freep(&gs_img);
+    av_buffer_unref(&area->buf[0]);
+    area->type = AV_SUBTITLE_FMT_ASS;
+
+    return ret;
+}
+
+/* Convert one bitmap subtitle frame to ASS via OCR and send it downstream.
+ * Takes ownership of frame: it is forwarded, parked in s->pending_frame, or freed. */
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+    AVFilterContext *ctx = inlink->dst;
+    SubOcrContext *s = ctx->priv;
+    AVFilterLink *outlink = inlink->dst->outputs[0];
+    int ret, frame_sent = 0;
+
+    if (frame->repeat_sub) {
+        // Ignore repeated frame
+        av_frame_free(&frame);
+        return 0;
+    }
+
+    if (s->pending_frame) {
+        const int64_t pts_diff = frame->subtitle_timing.start_pts - s->pending_frame->subtitle_timing.start_pts;
+
+        if (pts_diff == 0) {
+            // This is just a repetition of the previous frame, ignore it
+            av_frame_free(&frame);
+            return 0;
+        }
+
+        s->pending_frame->subtitle_timing.duration = pts_diff;
+
+        if ((ret = av_buffer_replace(&s->pending_frame->subtitle_header, s->subtitle_header)) < 0)
+            goto fail;
+        // ff_filter_frame() consumes the pending frame, also when it fails
+        ret = ff_filter_frame(outlink, s->pending_frame);
+        s->pending_frame = NULL;
+        if (ret < 0)
+            goto fail;
+        frame_sent = 1;
+    }
+
+    s->last_subtitle_pts = frame->subtitle_timing.start_pts;
+    if ((ret = av_frame_make_writable(frame)) < 0)
+        goto fail;
+
+    frame->format = AV_SUBTITLE_FMT_ASS;
+    av_log(ctx, AV_LOG_VERBOSE, "filter_frame sub_pts: %"PRIu64", duration: %"PRIu64", num_areas: %d\n",
+        frame->subtitle_timing.start_pts, frame->subtitle_timing.duration, frame->num_subtitle_areas);
+
+    if (frame->num_subtitle_areas > 1 &&
+        frame->subtitle_areas[0]->y > frame->subtitle_areas[frame->num_subtitle_areas - 1]->y) {
+        for (unsigned i = 0; i < frame->num_subtitle_areas / 2; i++)
+            FFSWAP(AVSubtitleArea*, frame->subtitle_areas[i], frame->subtitle_areas[frame->num_subtitle_areas - i - 1]);
+    }
+
+    for (int i = 0; i < frame->num_subtitle_areas; i++) {
+        AVSubtitleArea *area = frame->subtitle_areas[i];
+        int margin_v = 0;
+
+        ret = convert_area(ctx, area, frame, i, &margin_v);
+        if (ret < 0)
+            goto fail;
+
+        if (area->ass && area->ass[0] != '\0') {
+            const int layer = s->recognize ? i : 0;
+            char *tmp = area->ass;
+            area->ass = avpriv_ass_get_dialog_ex(s->readorder_counter++, layer, "Default", NULL, 0, 0, margin_v, tmp);
+            av_free(tmp);
+            if (!area->ass) { // NOTE(review): assumes NULL means allocation failure - confirm
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
+    }
+
+    // When decoders can't determine the end time, they are setting it either to UINT32_MAX
+    // or 30s (dvbsub).
+    if (s->delay_when_no_duration && frame->subtitle_timing.duration >= ms_to_avtb(29000)) {
+        // Can't send it without end time, wait for the next frame to determine the end_display time
+        s->pending_frame = frame;
+
+        if (frame_sent)
+            return 0;
+        // To keep all going, send an empty frame instead
+        frame = ff_get_subtitles_buffer(outlink, AV_SUBTITLE_FMT_ASS);
+        if (!frame)
+            return AVERROR(ENOMEM);
+        if ((ret = av_frame_copy_props(frame, s->pending_frame)) < 0)
+            goto fail;
+        frame->subtitle_timing.start_pts = 0;
+        frame->subtitle_timing.duration = 1;
+        frame->repeat_sub = 1;
+    }
+
+    if ((ret = av_buffer_replace(&frame->subtitle_header, s->subtitle_header)) < 0)
+        goto fail;
+
+    return ff_filter_frame(outlink, frame);
+
+fail:
+    // Only reached while frame is still owned here (not forwarded, not pending)
+    av_frame_free(&frame);
+    return ret;
+}
+
+#define OFFSET(x) offsetof(SubOcrContext, x)
+#define FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
+// Filter options. AV_OPT_TYPE_CONST entries are the named values of the option that shares their unit.
+static const AVOption graphicsub2text_options[] = {
+    { "delay_when_no_duration", "delay output when duration is unknown", OFFSET(delay_when_no_duration), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS, NULL },
+    { "dump_bitmaps", "save processed bitmaps as .ppm", OFFSET(dump_bitmaps), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS, NULL },
+    { "font_size_factor", "font size adjustment factor", OFFSET(font_size_factor), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.2, 5, FLAGS, NULL },
+    { "language", "ocr language", OFFSET(language), AV_OPT_TYPE_STRING, { .str = "eng" }, 0, 0, FLAGS, NULL },
+    { "ocr_mode", "set ocr mode", OFFSET(ocr_mode), AV_OPT_TYPE_INT, { .i64 = OEM_TESSERACT_ONLY }, OEM_TESSERACT_ONLY, OEM_TESSERACT_LSTM_COMBINED, FLAGS, "ocr_mode" },
+        { "tesseract", "classic tesseract ocr",    0, AV_OPT_TYPE_CONST, { .i64 = OEM_TESSERACT_ONLY          }, 0, 0, FLAGS, "ocr_mode" },
+        { "lstm",      "lstm (ML based)",          0, AV_OPT_TYPE_CONST, { .i64 = OEM_LSTM_ONLY               }, 0, 0, FLAGS, "ocr_mode" },
+        { "both",      "use both models combined", 0, AV_OPT_TYPE_CONST, { .i64 = OEM_TESSERACT_LSTM_COMBINED }, 0, 0, FLAGS, "ocr_mode" },
+    { "preprocess_images", "reduce colors, remove outlines", OFFSET(preprocess_images), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS, NULL },
+    { "recognize", "detect fonts, styles and colors", OFFSET(recognize), AV_OPT_TYPE_FLAGS, { .i64 = RFLAGS_ALL }, 0, INT_MAX, FLAGS, "reco_flags" },
+        { "none",         "no format detection",  0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_NONE         }, 0, 0, FLAGS, "reco_flags" },
+        { "halign",       "horizontal alignment", 0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_HALIGN       }, 0, 0, FLAGS, "reco_flags" },
+        { "valign",       "vertical alignment",   0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_VALIGN       }, 0, 0, FLAGS, "reco_flags" },
+        { "bold",         "font bold",            0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_FBOLD        }, 0, 0, FLAGS, "reco_flags" },
+        { "italic",       "font italic",          0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_FITALIC      }, 0, 0, FLAGS, "reco_flags" },
+        { "underline",    "font underline",       0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_FUNDERLINE   }, 0, 0, FLAGS, "reco_flags" },
+        { "font",         "font name",            0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_FONT         }, 0, 0, FLAGS, "reco_flags" },
+        { "fontsize",     "font size",            0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_FONTSIZE     }, 0, 0, FLAGS, "reco_flags" },
+        { "color",        "font color",           0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_COLOR        }, 0, 0, FLAGS, "reco_flags" },
+        { "outlinecolor", "outline color",        0, AV_OPT_TYPE_CONST, { .i64 = RFLAGS_OUTLINECOLOR }, 0, 0, FLAGS, "reco_flags" },
+    { "tessdata_path", "path to tesseract data", OFFSET(tessdata_path), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS, NULL },
+    { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(graphicsub2text);
+
+static const AVFilterPad inputs[] = {
+    { // the only input pad; incoming subtitle frames are handled by filter_frame()
+        .type         = AVMEDIA_TYPE_SUBTITLE,
+        .name         = "default",
+        .config_props = config_input,
+        .filter_frame = filter_frame,
+    },
+};
+
+static const AVFilterPad outputs[] = {
+    { // the only output pad; it is set up by config_output()
+        .type         = AVMEDIA_TYPE_SUBTITLE,
+        .name         = "default",
+        .config_props = config_output,
+    },
+};
+
+const AVFilter ff_sf_graphicsub2text = {
+    .name        = "graphicsub2text",
+    .description = NULL_IF_CONFIG_SMALL("Convert graphical subtitles to text subtitles via OCR"),
+    .priv_class  = &graphicsub2text_class, // from AVFILTER_DEFINE_CLASS(graphicsub2text)
+    .priv_size   = sizeof(SubOcrContext),  // size of the private context read via ctx->priv
+    .init        = init,
+    .uninit      = uninit,
+    FILTER_INPUTS(inputs),
+    FILTER_OUTPUTS(outputs),
+    FILTER_QUERY_FUNC(query_formats),
+};
-- 
ffmpeg-codebot