[FFmpeg-devel] [PATCH v3 2/2] avformat/movenc: add support for TTML muxing

Mon Aug 9 16:11:27 EEST 2021

On Thu, Aug 5, 2021 at 10:33 PM Andreas Rheinhardt
<andreas.rheinhardt at outlook.com> wrote:
>
> Jan Ekström:
> > From: Jan Ekström <jan.ekstrom at 24i.com>
> >
> > Includes basic support for both the ISMV ('dfxp') and MP4 ('stpp')
> > methods. This initial version also foregoes fragmentation support
> > in case the built-in sample squashing is to be utilized, as this
> > eases the initial review.
> >
> > Additionally, add basic tests for both muxing modes in MP4.
> >
> > Signed-off-by: Jan Ekström <jan.ekstrom at 24i.com>
> > ---
> >  libavformat/Makefile             |   2 +-
> >  libavformat/isom.h               |   3 +
> >  libavformat/movenc.c             | 179 ++++++++++++++++++++++++++++++-
> >  libavformat/movenc.h             |   5 +
> >  libavformat/movenc_ttml.c        | 178 ++++++++++++++++++++++++++++++
> >  libavformat/movenc_ttml.h        |  31 ++++++
> >  tests/fate/subtitles.mak         |   4 +
> >  tests/ref/fate/sub-ttml-mp4-dfxp |  44 ++++++++
> >  tests/ref/fate/sub-ttml-mp4-stpp |  44 ++++++++
> >  9 files changed, 487 insertions(+), 3 deletions(-)
> >  create mode 100644 libavformat/movenc_ttml.c
> >  create mode 100644 libavformat/movenc_ttml.h
> >  create mode 100644 tests/ref/fate/sub-ttml-mp4-dfxp
> >  create mode 100644 tests/ref/fate/sub-ttml-mp4-stpp
> >
> > diff --git a/libavformat/Makefile b/libavformat/Makefile
> > index 813ddd3c20..7e0f587b41 100644
> > --- a/libavformat/Makefile
> > +++ b/libavformat/Makefile
> > @@ -337,7 +337,7 @@ OBJS-$(CONFIG_MOV_DEMUXER)               += mov.o mov_chan.o mov_esds.o \
> >                                              qtpalette.o replaygain.o
> >  OBJS-$(CONFIG_MOV_MUXER)                 += movenc.o av1.o avc.o hevc.o vpcc.o \
> >                                              movenchint.o mov_chan.o rtp.o \
> > -                                            movenccenc.o rawutils.o
> > +                                            movenccenc.o movenc_ttml.o rawutils.o
> >  OBJS-$(CONFIG_MP2_MUXER)                 += rawenc.o
> >  OBJS-$(CONFIG_MP3_DEMUXER)               += mp3dec.o replaygain.o
> >  OBJS-$(CONFIG_MP3_MUXER)                 += mp3enc.o rawenc.o id3v2enc.o
> > diff --git a/libavformat/isom.h b/libavformat/isom.h
> > index ac1b3f3d56..34a58c79b7 100644
> > --- a/libavformat/isom.h
> > +++ b/libavformat/isom.h
> > @@ -387,4 +387,7 @@ static inline enum AVCodecID ff_mov_get_lpcm_codec_id(int bps, int flags)
> >      return ff_get_pcm_codec_id(bps, flags & 1, flags & 2, flags & 4 ? -1 : 0);
> >  }
> >
> > +#define MOV_ISMV_TTML_TAG MKTAG('d', 'f', 'x', 'p')
> > +#define MOV_MP4_TTML_TAG  MKTAG('s', 't', 'p', 'p')
> > +
> >  #endif /* AVFORMAT_ISOM_H */
> > diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> > index c85efe8748..f3e295ad80 100644
> > --- a/libavformat/movenc.c
> > +++ b/libavformat/movenc.c
> > @@ -56,6 +56,8 @@
> >  #include "hevc.h"
> >  #include "rtpenc.h"
> >  #include "mov_chan.h"
> > +#include "movenc_ttml.h"
> > +#include "ttmlenc.h"
> >  #include "vpcc.h"
> >
> >  static const AVOption options[] = {
> > @@ -119,6 +121,7 @@ static const AVClass mov_isobmff_muxer_class = {
> >  };
> >
> >  static int get_moov_size(AVFormatContext *s);
> > +static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt);
> >
> >  static int utf8len(const uint8_t *b)
> >  {
> > @@ -1787,7 +1790,29 @@ static int mov_write_subtitle_tag(AVIOContext *pb, MOVTrack *track)
> >
> >      if (track->par->codec_id == AV_CODEC_ID_DVD_SUBTITLE)
> >          mov_write_esds_tag(pb, track);
> > -    else if (track->par->extradata_size)
> > +    else if (track->par->codec_id == AV_CODEC_ID_TTML) {
> > +        switch (track->par->codec_tag) {
> > +        case MOV_ISMV_TTML_TAG:
> > +            // ISMV dfxp requires no extradata.
> > +            break;
> > +        case MOV_MP4_TTML_TAG:
> > +            // As specified in 14496-30, XMLSubtitleSampleEntry
> > +            // Namespace
> > +            avio_put_str(pb, "http://www.w3.org/ns/ttml");
> > +            // Empty schema_location
> > +            avio_w8(pb, 0);
> > +            // Empty auxiliary_mime_types
> > +            avio_w8(pb, 0);
> > +            break;
> > +        default:
> > +            av_log(NULL, AV_LOG_ERROR,
> > +                   "Unknown codec tag '%s' utilized for TTML stream with "
> > +                   "index %d (track id %d)!\n",
> > +                   av_fourcc2str(track->par->codec_tag), track->st->index,
> > +                   track->track_id);
> > +            return AVERROR(EINVAL);
> > +        }
> > +    } else if (track->par->extradata_size)
> >          avio_write(pb, track->par->extradata, track->par->extradata_size);
> >
> >      if (track->mode == MODE_MP4 &&
> > @@ -2661,6 +2686,14 @@ static int mov_write_nmhd_tag(AVIOContext *pb)
> >      return 12;
> >  }
> >
> > +static int mov_write_sthd_tag(AVIOContext *pb)
> > +{
> > +    avio_wb32(pb, 12);
> > +    ffio_wfourcc(pb, "sthd");
> > +    avio_wb32(pb, 0);
> > +    return 12;
> > +}
> > +
> >  static int mov_write_tcmi_tag(AVIOContext *pb, MOVTrack *track)
> >  {
> >      int64_t pos = avio_tell(pb);
> > @@ -2787,6 +2820,8 @@ static int mov_write_hdlr_tag(AVFormatContext *s, AVIOContext *pb, MOVTrack *tra
> >                      hdlr_type = "sbtl";
> >                  } else if (track->tag == MKTAG('m','p','4','s')) {
> >                      hdlr_type = "subp";
> > +                } else if (track->tag == MOV_MP4_TTML_TAG) {
> > +                    hdlr_type = "subt";
> >                  } else {
> >                      hdlr_type = "text";
> >                  }
> > @@ -2865,6 +2900,8 @@ static int mov_write_minf_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
> >      else if (track->par->codec_type == AVMEDIA_TYPE_SUBTITLE) {
> >          if (track->tag == MKTAG('t','e','x','t') || is_clcp_track(track)) {
> >              mov_write_gmhd_tag(pb, track);
> > +        } else if (track->tag == MOV_MP4_TTML_TAG) {
> > +            mov_write_sthd_tag(pb);
> >          } else {
> >              mov_write_nmhd_tag(pb);
> >          }
> > @@ -5253,6 +5290,68 @@ static int mov_flush_fragment_interleaving(AVFormatContext *s, MOVTrack *track)
> >      return 0;
> >  }
> >
> > +static int mov_write_squashed_packet(AVFormatContext *s, MOVTrack *track)
> > +{
> > +    MOVMuxContext *mov = s->priv_data;
> > +    AVPacket *squashed_packet = mov->pkt;
> > +    int ret = AVERROR_BUG;
> > +
> > +    switch (track->st->codecpar->codec_id) {
> > +    case AV_CODEC_ID_TTML: {
> > +        int had_packets = !!track->squashed_packet_queue;
> > +
> > +        if ((ret = ff_mov_generate_squashed_ttml_packet(s, track, squashed_packet)) < 0) {
> > +            goto finish_squash;
> > +        }
> > +
> > +        // We have generated a padding packet (no actual input packets in
> > +        // queue) and its duration is zero. Skipping writing it.
> > +        if (!had_packets && squashed_packet->duration == 0) {
> > +            goto finish_squash;
> > +        }
> > +
> > +        track->end_reliable = 1;
> > +        break;
> > +    }
> > +    default:
> > +        ret = AVERROR(EINVAL);
> > +        goto finish_squash;
> > +    }
> > +
> > +    squashed_packet->stream_index = track->st->index;
> > +
> > +    ret = mov_write_single_packet(s, squashed_packet);
> > +
> > +finish_squash:
> > +    av_packet_unref(squashed_packet);
> > +
> > +    return ret;
> > +}
> > +
> > +static int mov_write_squashed_packets(AVFormatContext *s)
> > +{
> > +    MOVMuxContext *mov = s->priv_data;
> > +
> > +    for (int i = 0; i < s->nb_streams; i++) {
> > +        MOVTrack *track = &mov->tracks[i];
> > +        int ret = AVERROR_BUG;
> > +
> > +        if (track->squash_fragment_samples_to_one && !track->entry) {
> > +            if ((ret = mov_write_squashed_packet(s, track)) < 0) {
> > +                av_log(s, AV_LOG_ERROR,
> > +                       "Failed to write squashed packet for %s stream with "
> > +                       "index %d and track id %d. Error: %s\n",
> > +                       avcodec_get_name(track->st->codecpar->codec_id),
> > +                       track->st->index, track->track_id,
> > +                       av_err2str(ret));
> > +                return ret;
> > +            }
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int mov_flush_fragment(AVFormatContext *s, int force)
> >  {
> >      MOVMuxContext *mov = s->priv_data;
> > @@ -5264,6 +5363,11 @@ static int mov_flush_fragment(AVFormatContext *s, int force)
> >      if (!(mov->flags & FF_MOV_FLAG_FRAGMENT))
> >          return 0;
> >
> > +    // Check if we have any tracks that require squashing.
> > +    // In that case, we'll have to write the packet here.
> > +    if ((ret = mov_write_squashed_packets(s)) < 0)
> > +        return ret;
> > +
> >      // Try to fill in the duration of the last packet in each stream
> >      // from queued packets in the interleave queues. If the flushing
> >      // of fragments was triggered automatically by an AVPacket, we
> > @@ -5739,7 +5843,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
> >      trk->cluster[trk->entry].entries          = samples_in_chunk;
> >      trk->cluster[trk->entry].dts              = pkt->dts;
> >      trk->cluster[trk->entry].pts              = pkt->pts;
> > -    if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
> > +    if (!trk->squash_fragment_samples_to_one &&
> > +        !trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
> >          if (!trk->frag_discont) {
> >              /* First packet of a new fragment. We already wrote the duration
> >               * of the last packet of the previous fragment based on track_duration,
> > @@ -6032,6 +6137,33 @@ static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
> >              }
> >          }
> >
> > +        if (trk->squash_fragment_samples_to_one) {
> > +            /*
> > +             * If the track has to have its samples squashed into one sample,
> > +             * we just take it into the track's queue.
> > +             * This will then be utilized as the samples get written in either
> > +             * mov_flush_fragment or when the mux is finalized in
> > +             * mov_write_trailer.
> > +             */
> > +            int ret = AVERROR_BUG;
> > +
> > +            if (pkt->pts == AV_NOPTS_VALUE) {
> > +                av_log(s, AV_LOG_ERROR,
> > +                       "Packets without a valid presentation timestamp are "
> > +                       "not supported with packet squashing!\n");
> > +                return AVERROR(EINVAL);
> > +            }
> > +
> > +            if ((ret = avpriv_packet_list_put(&trk->squashed_packet_queue,
> > +                                              &trk->squashed_packet_queue_end,
> > +                                              pkt, av_packet_ref, 0)) < 0) {
> > +                return ret;
> > +            }
> > +
> > +            return 0;
> > +        }
> > +
> > +
> >          if (trk->mode == MODE_MOV && trk->par->codec_type == AVMEDIA_TYPE_VIDEO) {
> >              AVPacket *opkt = pkt;
> >              int reshuffle_ret, ret;
> > @@ -6310,6 +6442,11 @@ static void mov_free(AVFormatContext *s)
> >
> >          ff_mov_cenc_free(&mov->tracks[i].cenc);
> >          ffio_free_dyn_buf(&mov->tracks[i].mdat_buf);
> > +
> > +        if (mov->tracks[i].squashed_packet_queue) {
> > +            avpriv_packet_list_free(&(mov->tracks[i].squashed_packet_queue),
> > +                                    &(mov->tracks[i].squashed_packet_queue_end));
> > +        }
> >      }
> >
> >      av_freep(&mov->tracks);
> > @@ -6700,6 +6837,36 @@ static int mov_init(AVFormatContext *s)
> >              }
> >          } else if (st->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
> >              track->timescale = st->time_base.den;
> > +
> > +            if (track->par->codec_id == AV_CODEC_ID_TTML) {
> > +                /* 14496-30 requires us to use a single sample per fragment
> > +                   for TTML, for which we define a per-track flag.
> > +
> > +                   We set the flag in case we are receiving TTML paragraphs
> > +                   from the input, in other words in case we are not doing
> > +                   stream copy. */
> > +                track->squash_fragment_samples_to_one =
> > +                    ff_is_ttml_stream_paragraph_based(track->par);
> > +
> > +                if (mov->flags & FF_MOV_FLAG_FRAGMENT &&
> > +                    track->squash_fragment_samples_to_one) {
> > +                    av_log(s, AV_LOG_ERROR,
> > +                           "Fragmentation is not currently supported for "
> > +                           "TTML in MP4/ISMV (track synchronization between "
> > +                           "subtitles and other media is not yet implemented)!\n");
> > +                    return AVERROR_PATCHWELCOME;
> > +                }
> > +
> > +                if (track->mode != MODE_ISM &&
> > +                    track->par->codec_tag == MOV_ISMV_TTML_TAG &&
> > +                    s->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
> > +                    av_log(s, AV_LOG_ERROR,
> > +                           "ISMV style TTML support with the 'dfxp' tag in "
> > +                           "non-ISMV formats is not officially supported. Add "
> > +                           "'-strict unofficial' if you want to use it.\n");
> > +                    return AVERROR_EXPERIMENTAL;
> > +                }
> > +            }
> >          } else if (st->codecpar->codec_type == AVMEDIA_TYPE_DATA) {
> >              track->timescale = st->time_base.den;
> >          } else {
> > @@ -7046,6 +7213,11 @@ static int mov_write_trailer(AVFormatContext *s)
> >          }
> >      }
> >
> > +    // Check if we have any tracks that require squashing.
> > +    // In that case, we'll have to write the packet here.
> > +    if ((res = mov_write_squashed_packets(s)) < 0)
> > +        return res;
> > +
> >      // If there were no chapters when the header was written, but there
> >      // are chapters now, write them in the trailer.  This only works
> >      // when we are not doing fragments.
> > @@ -7190,6 +7362,8 @@ static const AVCodecTag codec_mp4_tags[] = {
> >      { AV_CODEC_ID_MOV_TEXT,        MKTAG('t', 'x', '3', 'g') },
> >      { AV_CODEC_ID_BIN_DATA,        MKTAG('g', 'p', 'm', 'd') },
> >      { AV_CODEC_ID_MPEGH_3D_AUDIO,  MKTAG('m', 'h', 'm', '1') },
> > +    { AV_CODEC_ID_TTML,            MOV_MP4_TTML_TAG          },
> > +    { AV_CODEC_ID_TTML,            MOV_ISMV_TTML_TAG         },
> >      { AV_CODEC_ID_NONE,               0 },
> >  };
> >  #if CONFIG_MP4_MUXER || CONFIG_PSP_MUXER
> > @@ -7198,6 +7372,7 @@ static const AVCodecTag *const mp4_codec_tags_list[] = { codec_mp4_tags, NULL };
> >
> >  static const AVCodecTag codec_ism_tags[] = {
> >      { AV_CODEC_ID_WMAPRO      , MKTAG('w', 'm', 'a', ' ') },
> > +    { AV_CODEC_ID_TTML        , MOV_ISMV_TTML_TAG         },
> >      { AV_CODEC_ID_NONE        ,    0 },
> >  };
> >
> > diff --git a/libavformat/movenc.h b/libavformat/movenc.h
> > index af1ea0bce6..95db1bf46d 100644
> > --- a/libavformat/movenc.h
> > +++ b/libavformat/movenc.h
> > @@ -26,6 +26,7 @@
> >
> >  #include "avformat.h"
> >  #include "movenccenc.h"
> > +#include "libavcodec/packet_internal.h"
> >
> >  #define MOV_FRAG_INFO_ALLOC_INCREMENT 64
> >  #define MOV_INDEX_CLUSTER_SIZE 1024
> > @@ -164,6 +165,10 @@ typedef struct MOVTrack {
> >      int pal_done;
> >
> >      int is_unaligned_qt_rgb;
> > +
> > +    unsigned int squash_fragment_samples_to_one; //< flag to note formats where all samples for a fragment are to be squashed
> > +
> > +    PacketList *squashed_packet_queue, *squashed_packet_queue_end;
> >  } MOVTrack;
> >
> >  typedef enum {
> > diff --git a/libavformat/movenc_ttml.c b/libavformat/movenc_ttml.c
> > new file mode 100644
> > index 0000000000..bf4a6fd89e
> > --- /dev/null
> > +++ b/libavformat/movenc_ttml.c
> > @@ -0,0 +1,178 @@
> > +/*
> > + * MP4, ISMV Muxer TTML helpers
> > + * Copyright (c) 2021 24i
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "avformat.h"
> > +#include "avio_internal.h"
> > +#include "isom.h"
> > +#include "movenc.h"
> > +#include "movenc_ttml.h"
> > +#include "libavcodec/packet_internal.h"
> > +
> > +static const unsigned char empty_ttml_document[] =
> > +    "<tt xml:lang=\"\" xmlns=\"http://www.w3.org/ns/ttml\" />";
> > +
> > +static int mov_init_ttml_writer(MOVTrack *track, AVFormatContext **out_ctx)
> > +{
> > +    AVStream *movenc_stream = track->st, *ttml_stream = NULL;
> > +    AVFormatContext *ttml_ctx = NULL;
> > +    int ret = AVERROR_BUG;
> > +    if ((ret = avformat_alloc_output_context2(&ttml_ctx, NULL,
> > +                                              "ttml", NULL)) < 0)
> > +        return ret;
> > +
> > +    if ((ret = avio_open_dyn_buf(&ttml_ctx->pb)) < 0)
> > +        goto fail;
> > +
> > +    if (!(ttml_stream = avformat_new_stream(ttml_ctx, NULL))) {
> > +        ret = AVERROR(ENOMEM);
> > +        goto fail;
> > +    }
> > +
> > +    if ((ret = avcodec_parameters_copy(ttml_stream->codecpar,
> > +                                       movenc_stream->codecpar)) < 0)
> > +        goto fail;
> > +
> > +    ttml_stream->time_base = movenc_stream->time_base;
> > +
> > +    *out_ctx = ttml_ctx;
> > +
> > +    return 0;
> > +
> > +fail:
> > +    ffio_free_dyn_buf(&ttml_ctx->pb);
> > +    avformat_free_context(ttml_ctx);
>
> If you used out_ctx directly (i.e. don't use ttml_ctx at all), you could
> remove the cleanup code in this function and instead reuse the cleanup
> code in ff_mov_generate_squashed_ttml_packet().
> (But you will have to add a check for whether ttml_ctx exists in
> ff_mov_generate_squashed_ttml_packet().)
>
>

I'm kind of on the fence about this. I agree with trying to deduplicate
logic, but it is also nice for functions to be self-contained.

I thus requested some comments from other people, and Anton
noted that he preferred the self-containment of this function, albeit
only slightly.

Personally I'm mostly worn out with this stuff, so "meh". If you feel
strongly about this deduplication, I can add it to the next (and
hopefully final) revision of this patch set.

Jan