[FFmpeg-devel] [PATCH] movtextdec: fix handling of UTF-8 subtitles

wm4 nfxjfg at googlemail.com
Sun Mar 25 20:30:03 EEST 2018


On Sat, 24 Mar 2018 15:48:36 +0100
wm4 <nfxjfg at googlemail.com> wrote:

> Subtitles which contained styled UTF-8 text (i.e. not just 7 bit
> ASCII characters) were not handled correctly. The spec mandates that
> styling start/end ranges are in "characters". It's not quite clear what
> a "character" is supposed to be, but maybe they mean Unicode codepoints.
> 
> FFmpeg's decoder treated the style ranges as byte indexes, which could
> lead to UTF-8 sequences being broken, and the common code dropping the
> whole subtitle line.
> 
> Change this and count codepoints instead. This also means that even
> if this is somehow wrong, the decoder won't break UTF-8 sequences
> anymore. The sample which led me to investigate this now appears to work
> correctly.
> ---
> https://github.com/mpv-player/mpv/issues/5675
> ---
>  libavcodec/movtextdec.c | 50 ++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 37 insertions(+), 13 deletions(-)
> 
> diff --git a/libavcodec/movtextdec.c b/libavcodec/movtextdec.c
> index bd19577724..89ac791602 100644
> --- a/libavcodec/movtextdec.c
> +++ b/libavcodec/movtextdec.c
> @@ -326,9 +326,24 @@ static const Box box_types[] = {
>  
>  const static size_t box_count = FF_ARRAY_ELEMS(box_types);
>  
> +// Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
> +static int get_utf8_length_at(const char *text, const char *text_end)
> +{
> +    const char *start = text;
> +    int err = 0;
> +    uint32_t c;
> +    GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
> +    if (err)
> +        goto error;
> +    return text - start;
> +error:
> +    return 0;
> +}
> +
>  static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
> -                        MovTextContext *m)
> +                       AVCodecContext *avctx)
>  {
> +    MovTextContext *m = avctx->priv_data;
>      int i = 0;
>      int j = 0;
>      int text_pos = 0;
> @@ -342,6 +357,8 @@ static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
>      }
>  
>      while (text < text_end) {
> +        int len;
> +
>          if (m->box_flags & STYL_BOX) {
>              for (i = 0; i < m->style_entries; i++) {
>                  if (m->s[i]->style_flag && text_pos == m->s[i]->style_end) {
> @@ -388,17 +405,24 @@ static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
>              }
>          }
>  
> -        switch (*text) {
> -        case '\r':
> -            break;
> -        case '\n':
> -            av_bprintf(buf, "\\N");
> -            break;
> -        default:
> -            av_bprint_chars(buf, *text, 1);
> -            break;
> +        len = get_utf8_length_at(text, text_end);
> +        if (len < 1) {
> +            av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
> +            len = 1;
> +        }
> +        for (i = 0; i < len; i++) {
> +            switch (*text) {
> +            case '\r':
> +                break;
> +            case '\n':
> +                av_bprintf(buf, "\\N");
> +                break;
> +            default:
> +                av_bprint_chars(buf, *text, 1);
> +                break;
> +            }
> +            text++;
>          }
> -        text++;
>          text_pos++;
>      }
>  
> @@ -507,10 +531,10 @@ static int mov_text_decode_frame(AVCodecContext *avctx,
>              }
>              m->tracksize = m->tracksize + tsmb_size;
>          }
> -        text_to_ass(&buf, ptr, end, m);
> +        text_to_ass(&buf, ptr, end, avctx);
>          mov_text_cleanup(m);
>      } else
> -        text_to_ass(&buf, ptr, end, m);
> +        text_to_ass(&buf, ptr, end, avctx);
>  
>      ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
>      av_bprint_finalize(&buf, NULL);

Pushed.


More information about the ffmpeg-devel mailing list