[FFmpeg-devel] [PATCH 2/2] avformat/movenc: add support for TTML muxing

Mon Jul 12 16:20:02 EEST 2021

On Tue, 22 Jun 2021, Jan Ekström wrote:

> From: Jan Ekström <jan.ekstrom at 24i.com>
>
> Includes basic support for both the ISMV ('dfxp') and MP4 ('stpp')
> methods. This initial version also foregoes fragmentation support
> as this eases the initial review.

Hmm, I'm not sure I understand here, this seems to add at least some code 
in mov_flush_fragment, so there's some initial support for fragmentation 
present still - can you elaborate?

> Signed-off-by: Jan Ekström <jan.ekstrom at 24i.com>
> ---
> libavformat/Makefile      |   2 +-
> libavformat/isom.h        |   3 +
> libavformat/movenc.c      | 180 +++++++++++++++++++++++++++-
> libavformat/movenc.h      |   6 +
> libavformat/movenc_ttml.c | 243 ++++++++++++++++++++++++++++++++++++++
> libavformat/movenc_ttml.h |  31 +++++
> 6 files changed, 462 insertions(+), 3 deletions(-)
> create mode 100644 libavformat/movenc_ttml.c
> create mode 100644 libavformat/movenc_ttml.h
>
> diff --git a/libavformat/Makefile b/libavformat/Makefile
> index c9ef564523..931ad4ac45 100644
> --- a/libavformat/Makefile
> +++ b/libavformat/Makefile
> @@ -337,7 +337,7 @@ OBJS-$(CONFIG_MOV_DEMUXER)               += mov.o mov_chan.o mov_esds.o \
>                                             qtpalette.o replaygain.o
> OBJS-$(CONFIG_MOV_MUXER)                 += movenc.o av1.o avc.o hevc.o vpcc.o \
>                                             movenchint.o mov_chan.o rtp.o \
> -                                            movenccenc.o rawutils.o
> +                                            movenccenc.o movenc_ttml.o rawutils.o
> OBJS-$(CONFIG_MP2_MUXER)                 += rawenc.o
> OBJS-$(CONFIG_MP3_DEMUXER)               += mp3dec.o replaygain.o
> OBJS-$(CONFIG_MP3_MUXER)                 += mp3enc.o rawenc.o id3v2enc.o
> diff --git a/libavformat/isom.h b/libavformat/isom.h
> index ac1b3f3d56..34a58c79b7 100644
> --- a/libavformat/isom.h
> +++ b/libavformat/isom.h
> @@ -387,4 +387,7 @@ static inline enum AVCodecID ff_mov_get_lpcm_codec_id(int bps, int flags)
>     return ff_get_pcm_codec_id(bps, flags & 1, flags & 2, flags & 4 ? -1 : 0);
> }
> 
> +#define MOV_ISMV_TTML_TAG MKTAG('d', 'f', 'x', 'p')
> +#define MOV_MP4_TTML_TAG  MKTAG('s', 't', 'p', 'p')
> +
> #endif /* AVFORMAT_ISOM_H */
> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> index 04f3e94158..d4efb6217f 100644
> --- a/libavformat/movenc.c
> +++ b/libavformat/movenc.c
> @@ -56,6 +56,8 @@
> #include "hevc.h"
> #include "rtpenc.h"
> #include "mov_chan.h"
> +#include "movenc_ttml.h"
> +#include "ttmlenc.h"
> #include "vpcc.h"
> 
> static const AVOption options[] = {
> @@ -120,6 +122,7 @@ static const AVClass flavor ## _muxer_class = {\
> };
> 
> static int get_moov_size(AVFormatContext *s);
> +static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt);
> 
> static int utf8len(const uint8_t *b)
> {
> @@ -1788,7 +1791,29 @@ static int mov_write_subtitle_tag(AVIOContext *pb, MOVTrack *track)
>
>     if (track->par->codec_id == AV_CODEC_ID_DVD_SUBTITLE)
>         mov_write_esds_tag(pb, track);
> -    else if (track->par->extradata_size)
> +    else if (track->par->codec_id == AV_CODEC_ID_TTML) {
> +        switch (track->par->codec_tag) {
> +        case MOV_ISMV_TTML_TAG:
> +            // ye olde ISMV dfxp requires no extradata.

Nit: I'd prefer a more formal/serious wording in the comment than "ye 
olde" :P

> +            break;
> +        case MOV_MP4_TTML_TAG:
> +            // As specified in 14496-30, XMLSubtitleSampleEntry
> +            // Namespace
> +            avio_put_str(pb, "http://www.w3.org/ns/ttml");
> +            // Empty schema_location
> +            avio_w8(pb, 0);
> +            // Empty auxiliary_mime_types
> +            avio_w8(pb, 0);
> +            break;
> +        default:
> +            av_log(NULL, AV_LOG_ERROR,
> +                   "Unknown codec tag '%s' utilized for TTML stream with "
> +                   "index %d (track id %d)!\n",
> +                   av_fourcc2str(track->par->codec_tag), track->st->index,
> +                   track->track_id);
> +            return AVERROR(EINVAL);
> +        }
> +    } else if (track->par->extradata_size)
>         avio_write(pb, track->par->extradata, track->par->extradata_size);
>
>     if (track->mode == MODE_MP4 &&
> @@ -5254,6 +5279,71 @@ static int mov_flush_fragment_interleaving(AVFormatContext *s, MOVTrack *track)
>     return 0;
> }
> 
> +static int mov_write_squashed_packet(AVFormatContext *s, MOVTrack *track)
> +{
> +    AVPacket *squashed_packet = ((MOVMuxContext *)s->priv_data)->pkt;

Nit: Maybe spell out the intermediate MOVMuxContext pointer to a separate 
variable for clarity, even if it's used only once.

> +    int ret = AVERROR_BUG;
> +
> +    switch (track->st->codecpar->codec_id) {
> +    case AV_CODEC_ID_TTML:
> +        {
> +            int we_had_packets = !!track->squashed_packet_queue;

Nit: We don't really need the strict 0/1 value of we_had_packets here, so 
we don't need the double negation. And maybe drop the "we_" prefix?

> +
> +            if ((ret = ff_mov_generate_squashed_ttml_packet(s, track, squashed_packet)) < 0) {
> +                goto finish_squash;
> +            }
> +
> +            // We have generated a padding packet (no actual input packets in
> +            // queue) and its duration is zero. Skipping writing it.
> +            if (!we_had_packets && squashed_packet->duration == 0) {
> +                goto finish_squash;
> +            }
> +
> +            track->end_reliable = 1;
> +            break;

Nit: Odd double indentation; in other places I think we place the extra 
braces at the end of e.g. "case AV_CODEC_ID_TTML: {" and the closing one 
on the same indentation level as the case.

> +        }
> +    default:
> +        ret = AVERROR(EINVAL);
> +        goto finish_squash;
> +    }
> +
> +    squashed_packet->stream_index = track->st->index;
> +
> +    ret = mov_write_single_packet(s, squashed_packet);
> +
> +finish_squash:
> +    if (!track->squashed_packet_queue) {
> +        track->packet_queue_start_ts = track->packet_queue_end_ts = AV_NOPTS_VALUE;
> +    }
> +    av_packet_unref(squashed_packet);
> +
> +    return ret;
> +}
> +
> +static int mov_write_squashed_packets(AVFormatContext *s)
> +{
> +    MOVMuxContext *mov = s->priv_data;
> +
> +    for (int i = 0; i < s->nb_streams; i++) {
> +        MOVTrack *track = &mov->tracks[i];
> +        int ret = AVERROR_BUG;
> +
> +        if (track->squash_fragment_samples_to_one && !track->entry) {
> +            if ((ret = mov_write_squashed_packet(s, track)) < 0) {
> +                av_log(s, AV_LOG_ERROR,
> +                       "Failed to write squashed packet for %s stream with "
> +                       " index %d and track id %d. Error: %s\n",

Nit: You have a space both at the end of the first string literal and at 
the start of the second one, so the message ends up with a double space.

> +                       avcodec_get_name(track->st->codecpar->codec_id),
> +                       track->st->index, track->track_id,
> +                       av_err2str(ret));
> +                return ret;
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> static int mov_flush_fragment(AVFormatContext *s, int force)
> {
>     MOVMuxContext *mov = s->priv_data;
> @@ -5265,6 +5355,11 @@ static int mov_flush_fragment(AVFormatContext *s, int force)
>     if (!(mov->flags & FF_MOV_FLAG_FRAGMENT))
>         return 0;
> 
> +    // Check if we have any tracks that require squashing.
> +    // In that case, we'll have to write the packet here.
> +    if ((ret = mov_write_squashed_packets(s)) < 0)
> +        return ret;
> +
>     // Try to fill in the duration of the last packet in each stream
>     // from queued packets in the interleave queues. If the flushing
>     // of fragments was triggered automatically by an AVPacket, we
> @@ -5729,7 +5824,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>     trk->cluster[trk->entry].entries          = samples_in_chunk;
>     trk->cluster[trk->entry].dts              = pkt->dts;
>     trk->cluster[trk->entry].pts              = pkt->pts;
> -    if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
> +    if (!trk->squash_fragment_samples_to_one &&
> +        !trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
>         if (!trk->frag_discont) {
>             /* First packet of a new fragment. We already wrote the duration
>              * of the last packet of the previous fragment based on track_duration,
> @@ -6022,6 +6118,42 @@ static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>             }
>         }
> 
> +        if (trk->squash_fragment_samples_to_one) {
> +            /*
> +             * If the track has to have its samples squashed into one sample,
> +             * we just take it into the track's queue.
> +             * This will then be utilized as the samples get written in either
> +             * mov_flush_fragment or when the mux is finalized in
> +             * mov_write_trailer.
> +             */
> +            int ret = AVERROR_BUG;
> +            int64_t compared_end_ts = pkt->duration >= 0 ?
> +                                      (pkt->pts + pkt->duration) : pkt->pts;
> +
> +            if (pkt->pts == AV_NOPTS_VALUE) {
> +                av_log(s, AV_LOG_ERROR,
> +                       "Packets without a valid presentation timestamp are "
> +                       "not supported with packet squashing!\n");
> +                return AVERROR(EINVAL);
> +            }
> +
> +            trk->packet_queue_start_ts =
> +                trk->packet_queue_start_ts == AV_NOPTS_VALUE ?
> +                pkt->pts : FFMIN(trk->packet_queue_start_ts, pkt->pts);
> +
> +            trk->packet_queue_end_ts =
> +                FFMAX(trk->packet_queue_end_ts, compared_end_ts);
> +
> +            if ((ret = avpriv_packet_list_put(&trk->squashed_packet_queue,
> +                                              &trk->squashed_packet_queue_end,
> +                                              pkt, av_packet_ref, 0)) < 0) {
> +                return ret;
> +            }
> +
> +            return 0;
> +        }
> +
> +
>         if (trk->mode == MODE_MOV && trk->par->codec_type == AVMEDIA_TYPE_VIDEO) {
>             AVPacket *opkt = pkt;
>             int reshuffle_ret, ret;
> @@ -6300,6 +6432,11 @@ static void mov_free(AVFormatContext *s)
>
>         ff_mov_cenc_free(&mov->tracks[i].cenc);
>         ffio_free_dyn_buf(&mov->tracks[i].mdat_buf);
> +
> +        if (mov->tracks[i].squashed_packet_queue) {
> +            avpriv_packet_list_free(&(mov->tracks[i].squashed_packet_queue),
> +                                    &(mov->tracks[i].squashed_packet_queue_end));
> +        }
>     }
>
>     av_freep(&mov->tracks);
> @@ -6580,6 +6717,7 @@ static int mov_init(AVFormatContext *s)
>         track->start_cts  = AV_NOPTS_VALUE;
>         track->end_pts    = AV_NOPTS_VALUE;
>         track->dts_shift  = AV_NOPTS_VALUE;
> +        track->packet_queue_start_ts = track->packet_queue_end_ts = AV_NOPTS_VALUE;
>         if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
>             if (track->tag == MKTAG('m','x','3','p') || track->tag == MKTAG('m','x','3','n') ||
>                 track->tag == MKTAG('m','x','4','p') || track->tag == MKTAG('m','x','4','n') ||
> @@ -6690,6 +6828,36 @@ static int mov_init(AVFormatContext *s)
>             }
>         } else if (st->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
>             track->timescale = st->time_base.den;
> +
> +            if (track->par->codec_id == AV_CODEC_ID_TTML) {
> +                /* 14496-30 requires us to use a single sample per fragment
> +                   for TTML, for which we define a per-track flag.
> +
> +                   We set the flag in case we are receiving TTML paragraphs
> +                   from the input, in other words in case we are not doing
> +                   stream copy. */
> +                track->squash_fragment_samples_to_one =
> +                    ff_is_ttml_stream_paragraph_based(track->par);
> +
> +                if (mov->flags & FF_MOV_FLAG_FRAGMENT &&
> +                    track->squash_fragment_samples_to_one) {
> +                    av_log(s, AV_LOG_ERROR,
> +                           "Fragmentation is not currently supported for "
> +                           "TTML in MP4/ISMV (track synchronization between "
> +                           "subtitles and other media is not yet implemented)!\n");
> +                    return AVERROR(EINVAL);
> +                }
> +
> +                if (track->mode == MODE_MP4 &&
> +                    track->par->codec_tag == MOV_ISMV_TTML_TAG &&
> +                    s->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
> +                    av_log(s, AV_LOG_ERROR,
> +                           "ISMV style TTML support with the 'dfxp' tag in MP4 "
> +                           "is not officially supported, add "
> +                           "'-strict unofficial' if you want to use it.\n");
> +                    return AVERROR_EXPERIMENTAL;
> +                }
> +            }
>         } else if (st->codecpar->codec_type == AVMEDIA_TYPE_DATA) {
>             track->timescale = st->time_base.den;
>         } else {
> @@ -7035,6 +7203,11 @@ static int mov_write_trailer(AVFormatContext *s)
>         }
>     }
> 
> +    // Check if we have any tracks that require squashing.
> +    // In that case, we'll have to write the packet here.
> +    if ((res = mov_write_squashed_packets(s)) < 0)
> +        return res;
> +
>     // If there were no chapters when the header was written, but there
>     // are chapters now, write them in the trailer.  This only works
>     // when we are not doing fragments.
> @@ -7179,6 +7352,8 @@ static const AVCodecTag codec_mp4_tags[] = {
>     { AV_CODEC_ID_MOV_TEXT,        MKTAG('t', 'x', '3', 'g') },
>     { AV_CODEC_ID_BIN_DATA,        MKTAG('g', 'p', 'm', 'd') },
>     { AV_CODEC_ID_MPEGH_3D_AUDIO,  MKTAG('m', 'h', 'm', '1') },
> +    { AV_CODEC_ID_TTML,            MOV_MP4_TTML_TAG          },
> +    { AV_CODEC_ID_TTML,            MOV_ISMV_TTML_TAG         },
>     { AV_CODEC_ID_NONE,               0 },

Is this a typo when you have both tags here, or is it to allow muxing 
content with both tags into mp4?

> };
> #if CONFIG_MP4_MUXER || CONFIG_PSP_MUXER
> @@ -7187,6 +7362,7 @@ static const AVCodecTag *const mp4_codec_tags_list[] = { codec_mp4_tags, NULL };
> 
> static const AVCodecTag codec_ism_tags[] = {
>     { AV_CODEC_ID_WMAPRO      , MKTAG('w', 'm', 'a', ' ') },
> +    { AV_CODEC_ID_TTML        , MOV_ISMV_TTML_TAG         },
>     { AV_CODEC_ID_NONE        ,    0 },
> };
> 
> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
> index af1ea0bce6..9036e42f09 100644
> --- a/libavformat/movenc.h
> +++ b/libavformat/movenc.h
> @@ -26,6 +26,7 @@
> 
> #include "avformat.h"
> #include "movenccenc.h"
> +#include "libavcodec/packet_internal.h"
> 
> #define MOV_FRAG_INFO_ALLOC_INCREMENT 64
> #define MOV_INDEX_CLUSTER_SIZE 1024
> @@ -164,6 +165,11 @@ typedef struct MOVTrack {
>     int pal_done;
>
>     int is_unaligned_qt_rgb;
> +
> +    unsigned int squash_fragment_samples_to_one; //< flag to note formats where all samples for a fragment are to be squashed
> +
> +    PacketList *squashed_packet_queue, *squashed_packet_queue_end;
> +    int64_t packet_queue_start_ts, packet_queue_end_ts;
> } MOVTrack;
> 
> typedef enum {
> diff --git a/libavformat/movenc_ttml.c b/libavformat/movenc_ttml.c
> new file mode 100644
> index 0000000000..865efbdbce
> --- /dev/null
> +++ b/libavformat/movenc_ttml.c
> @@ -0,0 +1,243 @@
> +/*
> + * MP4, ISMV Muxer TTML helpers
> + * Copyright (c) 2021 24i
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "avformat.h"
> +#include "avio_internal.h"
> +#include "isom.h"
> +#include "movenc.h"
> +#include "movenc_ttml.h"
> +#include "libavcodec/packet_internal.h"
> +
> +static const unsigned char empty_ttml_document[] =
> +    "<tt xml:lang=\"\" xmlns=\"http://www.w3.org/ns/ttml\" />";
> +
> +static int mov_init_ttml_writer(MOVTrack *track, AVFormatContext **out_ctx)
> +{
> +    AVStream *movenc_stream = track->st, *ttml_stream = NULL;
> +    AVFormatContext *ttml_ctx = NULL;
> +    int ret = AVERROR_BUG;
> +    if ((ret = avformat_alloc_output_context2(&ttml_ctx, NULL,
> +                                              "ttml", NULL)) < 0)
> +        goto fail;
> +
> +    if ((ret = avio_open_dyn_buf(&ttml_ctx->pb)) < 0)
> +        goto fail;
> +
> +    if (!(ttml_stream = avformat_new_stream(ttml_ctx, NULL))) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    if ((ret = avcodec_parameters_copy(ttml_stream->codecpar,
> +                                       movenc_stream->codecpar)) < 0)
> +        goto fail;
> +
> +    ttml_stream->time_base = movenc_stream->time_base;
> +
> +    *out_ctx = ttml_ctx;
> +
> +    return 0;
> +
> +fail:
> +    if (ttml_ctx) {
> +        uint8_t *buf = NULL;
> +        avio_close_dyn_buf(ttml_ctx->pb, &buf);
> +        av_freep(&buf);
> +    }
> +
> +    avformat_free_context(ttml_ctx);
> +
> +    return ret;
> +}
> +
> +static void mov_calculate_start_and_end_based_on_other_tracks(AVFormatContext *s,
> +                                                              MOVTrack *track,
> +                                                              int64_t *start_ts,
> +                                                              int64_t *end_ts)

Can you find a shorter name for this function? Otherwise, I'd suggest a 
different indentation style, e.g. like this:

static void long_function_name(
     type param1, type param2, type param3);

or something like that...

Other than that, the patch looks tolerable I think - I didn't try to 
follow all the internal details of the TTML specific packet handling 
though.

// Martin