[FFmpeg-devel] matroskadec: support S_TEXT/WEBVTT

Leo Izen leo.izen at gmail.com
Wed Dec 25 18:45:38 EET 2024


On 12/25/24 3:46 AM, Wang Bin wrote:
> On 12/19/24 3:59 AM, Wang Bin wrote:
>>> based on Hendrik Leppkes's fork
>>>
>>>
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel at ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>>
>> This patch has a big web of gotos. This can get very confusing. Is there
>> any way to change it so there's if blocks or other static functions to
>> call? Gotos should be used very sparingly and mostly to goto the end of
>> a block where cleanup happens.
>>
>> - Leo Izen (Traneptora)


> From 5bd4a6f56231c3160be6c2d322f5d1081d7b3622 Mon Sep 17 00:00:00 2001
> From: wang-bin <wbsecg1 at gmail.com>
> Date: Tue, 17 Dec 2024 23:21:57 +0800
> Subject: [PATCH] matroskadec: support S_TEXT/WEBVTT
> 
> fix ticket #5641
> ---
>   libavformat/matroska.c    |   1 +
>   libavformat/matroskadec.c | 102 ++++++++++++++++++++++----------------
>   2 files changed, 60 insertions(+), 43 deletions(-)
> 
> diff --git a/libavformat/matroska.c b/libavformat/matroska.c
> index d0ecfbeb6a..23c4ad6949 100644
> --- a/libavformat/matroska.c
> +++ b/libavformat/matroska.c
> @@ -63,6 +63,7 @@ const CodecTags ff_mkv_codec_tags[]={
>       {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
>       {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
>       {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
> +    {"S_TEXT/WEBVTT"        , AV_CODEC_ID_WEBVTT},
> 
>       {"S_TEXT/UTF8"      , AV_CODEC_ID_SUBRIP},
>       {"S_TEXT/UTF8"      , AV_CODEC_ID_TEXT},
> diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
> index 0e150f9138..11af2cface 100644
> --- a/libavformat/matroskadec.c
> +++ b/libavformat/matroskadec.c
> @@ -3761,68 +3761,83 @@ static int matroska_parse_prores(MatroskaTrack 
> *track,
>       return 0;
>   }
> 
> +static int matroska_webvtt_read_line(uint8_t** begin, uint8_t* end)

static ptrdiff_t matroska_webvtt_read_line(const uint8_t **begin, const 
uint8_t *end)

> +{
> +    uint8_t *p = *begin;
> +    int len = -1;

ptrdiff_t len = -1;


> +
> +    while (p < end) {
> +        if (*p == '\r' || *p == '\n') {
> +            len = p - *begin;
> +            if (*p == '\r')
> +                p++;

This will cause AVERROR_INVALIDDATA to return, if a \r is found without 
a \n after it. Is this intentional?


> +            break;
> +        }
> +        p++;
> +    }
> +
> +    if (p >= end || *p != '\n')
> +        return AVERROR_INVALIDDATA;
> +    p++;
> +    *begin = p;


These two lines can be replaced with: *begin = p + 1;



> +    return len;
> +}
> +
>   static int matroska_parse_webvtt(MatroskaDemuxContext *matroska,
>                                    MatroskaTrack *track,
>                                    AVStream *st,
>                                    uint8_t *data, int data_len,
> +                                 MatroskaBlockMore *blockmore, int 
> nb_blockmore,
>                                    uint64_t timecode,
>                                    uint64_t duration,
>                                    int64_t pos)
>   {
>       AVPacket *pkt = matroska->pkt;
>       uint8_t *id, *settings, *text, *buf;
> -    int id_len, settings_len, text_len;
> +    int id_len = -1, settings_len = -1, text_len = -1;

ptrdiff_t id_len, settings_le, text_len;

This are never read uninitialized.


>       uint8_t *p, *q;
>       int err;
> +    const int webm_style = !strncmp(track->codec_id, "D_WEBVTT/", 9);
> 
>       if (data_len <= 0)
>           return AVERROR_INVALIDDATA;
> 
> -    p = data;
> -    q = data + data_len;
> -
> -    id = p;
> -    id_len = -1;
> -    while (p < q) {
> -        if (*p == '\r' || *p == '\n') {
> -            id_len = p - id;
> -            if (*p == '\r')
> -                p++;
> -            break;
> +    if (webm_style) {
> +        p = data;
> +        q = data + data_len;
> +        id = p;
> +        id_len = matroska_webvtt_read_line(&p, q);
> +        if (id_len < 0)
> +            return AVERROR_INVALIDDATA;
> +        settings = p;
> +        settings_len = matroska_webvtt_read_line(&p, q);
> +        if (settings_len < 0)
> +            return AVERROR_INVALIDDATA;
> +        text = p;
> +        text_len = data + data_len - p;
> +        while (text_len > 0) {
> +            const int len = text_len - 1;
> +            const uint8_t c = p[len];
> +            if (c != '\r' && c != '\n')
> +                break;
> +            text_len = len;
>           }

You can replace this loop with:

while (text_len-- > 0 && (p[text_len] == '\r' || p[text_len] == '\n')) {
     // empty
}
text_len++;

The lazy && ensures that text_len-- is evaluated before p[text_len] is 
dereferenced.

> -        p++;
> -    }
> -
> -    if (p >= q || *p != '\n')
> -        return AVERROR_INVALIDDATA;
> -    p++;
> -
> -    settings = p;
> -    settings_len = -1;
> -    while (p < q) {
> -        if (*p == '\r' || *p == '\n') {
> -            settings_len = p - settings;
> -            if (*p == '\r')
> -                p++;
> -            break;
> +    } else {
> +        if (nb_blockmore > 0) {

} else if (nb_blockmore > 0) {

> +            p = blockmore->additional.data;
> +            q = p + blockmore->additional.size;
> +            settings = p;
> +            settings_len = matroska_webvtt_read_line(&p, q);
> +            if (settings_len < 0)
> +                return AVERROR_INVALIDDATA;
> +            id = p;
> +            id_len = matroska_webvtt_read_line(&p, q);
> +            if (id_len < 0)
> +                return AVERROR_INVALIDDATA;
>           }
> -        p++;
> -    }
> -
> -    if (p >= q || *p != '\n')
> -        return AVERROR_INVALIDDATA;
> -    p++;
> -
> -    text = p;
> -    text_len = q - p;
> -    while (text_len > 0) {
> -        const int len = text_len - 1;
> -        const uint8_t c = p[len];
> -        if (c != '\r' && c != '\n')
> -            break;
> -        text_len = len;
> +        text = data;
> +        text_len = data_len;
>       }
> -
>       if (text_len <= 0)
>           return AVERROR_INVALIDDATA;
> 
> @@ -4215,6 +4230,7 @@ static int 
> matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
>           } else if (st->codecpar->codec_id == AV_CODEC_ID_WEBVTT) {
>               res = matroska_parse_webvtt(matroska, track, st,
>                                           out_data, out_size,
> +                                        blockmore, nb_blockmore,
>                                           timecode, lace_duration,
>                                           pos);
>               if (!buf)


Overall it's a lot cleaner and easier to read though.

- Leo Izen (Traneptora)



More information about the ffmpeg-devel mailing list