[FFmpeg-devel] [PATCH] movenc: Allow writing timed ID3 metadata

Martin Storsjö martin at martin.st
Thu Apr 4 13:29:51 EEST 2024


This is based on a spec at https://aomediacodec.github.io/id3-emsg/,
further based on ISO/IEC 23009-1:2019.

Within libavformat, timed ID3 metadata (already supported by the
mpegts demuxer and muxer) is handled as a separate data AVStream
with codec ID AV_CODEC_ID_TIMED_ID3. However, such a stream doesn't
have a corresponding track in the mov file - instead, these events
are written as separate toplevel 'emsg' boxes.
---
 libavformat/movenc.c       | 49 ++++++++++++++++++++++++++++++++-
 libavformat/tests/movenc.c | 55 +++++++++++++++++++++++++++++++++-----
 tests/ref/fate/movenc      |  8 ++++++
 3 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index ccdd2dbfc9..29b1e4bb0f 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -5515,7 +5515,7 @@ static int mov_write_ftyp_tag(AVIOContext *pb, AVFormatContext *s)
 {
     MOVMuxContext *mov = s->priv_data;
     int64_t pos = avio_tell(pb);
-    int has_h264 = 0, has_av1 = 0, has_video = 0, has_dolby = 0;
+    int has_h264 = 0, has_av1 = 0, has_video = 0, has_dolby = 0, has_id3 = 0;
     int has_iamf = 0;
 
     for (int i = 0; i < s->nb_stream_groups; i++) {
@@ -5544,6 +5544,8 @@ static int mov_write_ftyp_tag(AVIOContext *pb, AVFormatContext *s)
                                     st->codecpar->nb_coded_side_data,
                                     AV_PKT_DATA_DOVI_CONF))
             has_dolby = 1;
+        if (st->codecpar->codec_id == AV_CODEC_ID_TIMED_ID3)
+            has_id3 = 1;
     }
 
     avio_wb32(pb, 0); /* size */
@@ -5623,6 +5625,9 @@ static int mov_write_ftyp_tag(AVIOContext *pb, AVFormatContext *s)
     if (mov->flags & FF_MOV_FLAG_DASH && mov->flags & FF_MOV_FLAG_GLOBAL_SIDX)
         ffio_wfourcc(pb, "dash");
 
+    if (has_id3)
+        ffio_wfourcc(pb, "aid3");
+
     return update_size(pb, pos);
 }
 
@@ -6704,6 +6709,34 @@ static int mov_build_iamf_packet(AVFormatContext *s, MOVTrack *trk, AVPacket *pk
     return ret;
 }
 
+static int mov_write_emsg_tag(AVIOContext *pb, AVStream *st, AVPacket *pkt)
+{ /* Write the payload of pkt (a packet of stream st) to pb as one 'emsg' box. */
+    int64_t pos = avio_tell(pb); /* start of the box, handed to update_size() */
+    const char *scheme_id_uri = "https://aomedia.org/emsg/ID3";
+    const char *value = "";
+    /* Only time_base.den is written as the timescale below, so num must be 1. */
+    av_assert0(st->time_base.num == 1);
+    /* NOTE(review): pkt->pts is used as-is; AV_NOPTS_VALUE is not checked here. */
+    avio_write_marker(pb,
+                      av_rescale_q(pkt->pts, st->time_base, AV_TIME_BASE_Q),
+                      AVIO_DATA_MARKER_BOUNDARY_POINT);
+
+    avio_wb32(pb, 0); /* size placeholder; presumably patched by update_size() - confirm */
+    ffio_wfourcc(pb, "emsg");
+    avio_w8(pb, 1); /* version */
+    avio_wb24(pb, 0); /* presumably the FullBox flags field - confirm against spec */
+    avio_wb32(pb, st->time_base.den); /* timescale */
+    avio_wb64(pb, pkt->pts); /* presentation_time, in units of the stream time base */
+    avio_wb32(pb, 0xFFFFFFFFU); /* event_duration; all ones, presumably "unknown" - confirm */
+    avio_wb32(pb, 0); /* id; always 0 here */
+    /* null terminated UTF8 strings: each is written including its NUL (strlen + 1) */
+    avio_write(pb, scheme_id_uri, strlen(scheme_id_uri) + 1);
+    avio_write(pb, value, strlen(value) + 1);
+    avio_write(pb, pkt->data, pkt->size); /* raw packet payload follows the strings */
+
+    return update_size(pb, pos);
+}
+
 static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 {
     MOVMuxContext *mov = s->priv_data;
@@ -6714,6 +6747,11 @@ static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
         return 1;
     }
 
+    if (s->streams[pkt->stream_index]->codecpar->codec_id == AV_CODEC_ID_TIMED_ID3) {
+        mov_write_emsg_tag(s->pb, s->streams[pkt->stream_index], pkt);
+        return 0;
+    }
+
     trk = s->streams[pkt->stream_index]->priv_data;
 
     if (trk->iamf) {
@@ -7365,6 +7403,12 @@ static int mov_init(AVFormatContext *s)
         AVStream *st = s->streams[i];
         if (st->priv_data)
             continue;
+        // Don't produce a track in the output file for timed ID3 streams.
+        if (st->codecpar->codec_id == AV_CODEC_ID_TIMED_ID3) {
+            // Leave priv_data set to NULL for these AVStreams that don't
+            // have a corresponding track.
+            continue;
+        }
         st->priv_data = st;
         mov->nb_tracks++;
     }
@@ -7462,6 +7506,9 @@ static int mov_init(AVFormatContext *s)
         MOVTrack *track = st->priv_data;
         AVDictionaryEntry *lang = av_dict_get(st->metadata, "language", NULL,0);
 
+        if (!track)
+            continue;
+
         if (!track->st) {
             track->st  = st;
             track->par = st->codecpar;
diff --git a/libavformat/tests/movenc.c b/libavformat/tests/movenc.c
index 12a3632d4e..2fd5c67e76 100644
--- a/libavformat/tests/movenc.c
+++ b/libavformat/tests/movenc.c
@@ -58,7 +58,7 @@ struct AVMD5* md5;
 uint8_t hash[HASH_SIZE];
 
 AVPacket *pkt;
-AVStream *video_st, *audio_st;
+AVStream *video_st, *audio_st, *id3_st;
 int64_t audio_dts, video_dts;
 
 int bframes;
@@ -177,7 +177,7 @@ static void check_func(int value, int line, const char *msg, ...)
 }
 #define check(value, ...) check_func(value, __LINE__, __VA_ARGS__)
 
-static void init_fps(int bf, int audio_preroll, int fps)
+static void init_fps(int bf, int audio_preroll, int fps, int id3)
 {
     AVStream *st;
     int iobuf_size = force_iobuf_size ? force_iobuf_size : sizeof(iobuf);
@@ -226,6 +226,17 @@ static void init_fps(int bf, int audio_preroll, int fps)
     memcpy(st->codecpar->extradata, aac_extradata, sizeof(aac_extradata));
     audio_st = st;
 
+    if (id3) {
+        st = avformat_new_stream(ctx, NULL);
+        if (!st)
+            exit(1);
+        st->codecpar->codec_type = AVMEDIA_TYPE_DATA;
+        st->codecpar->codec_id = AV_CODEC_ID_TIMED_ID3;
+        st->time_base.num = 1;
+        st->time_base.den = 1000;
+        id3_st = st;
+    }
+
     if (avformat_write_header(ctx, &opts) < 0)
         exit(1);
     av_dict_free(&opts);
@@ -245,7 +256,7 @@ static void init_fps(int bf, int audio_preroll, int fps)
 
 static void init(int bf, int audio_preroll)
 {
-    init_fps(bf, audio_preroll, 30);
+    init_fps(bf, audio_preroll, 30, 0);
 }
 
 static void mux_frames(int n, int c)
@@ -316,6 +327,23 @@ static void mux_frames(int n, int c)
     }
 }
 
+static void mux_id3(void)
+{ /* Test helper: send one 8-byte packet on the timed ID3 stream (id3_st). */
+    uint8_t pktdata[8] = { 0 }; /* dummy payload; bytes 4..7 are filled with the pts below */
+    av_packet_unref(pkt);
+    /* Timestamp: video_dts (plus one duration if bframes is set), rescaled to id3_st's time base. */
+    pkt->dts = pkt->pts = av_rescale_q(video_dts + (bframes ? duration : 0),
+                                       video_st->time_base, id3_st->time_base);
+    pkt->stream_index = id3_st->index;
+    pkt->duration = 0;
+
+    AV_WB32(pktdata + 4, pkt->pts); /* pts stored at offset 4; first 4 bytes stay zero */
+    pkt->data = pktdata; /* points at the local array above, valid only during this call */
+    pkt->size = 8;
+    /* NOTE(review): the return value of av_write_frame() is not checked. */
+    av_write_frame(ctx, pkt);
+}
+
 static void mux_gops(int n)
 {
     mux_frames(gop_size * n, 0);
@@ -708,7 +736,7 @@ int main(int argc, char **argv)
     // by the edit list.
     init_out("vfr");
     av_dict_set(&opts, "movflags", "+frag_keyframe+delay_moov+dash", 0);
-    init_fps(1, 1, 3);
+    init_fps(1, 1, 3, 0);
     mux_frames(gop_size/2, 0);
     duration /= 10;
     mux_frames(gop_size/2, 0);
@@ -727,7 +755,7 @@ int main(int argc, char **argv)
     clear_duration = 1;
     init_out("vfr-noduration");
     av_dict_set(&opts, "movflags", "+frag_keyframe+delay_moov+dash", 0);
-    init_fps(1, 1, 3);
+    init_fps(1, 1, 3, 0);
     mux_frames(gop_size/2, 0);
     duration /= 10;
     mux_frames(gop_size/2, 0);
@@ -743,7 +771,7 @@ int main(int argc, char **argv)
     force_iobuf_size = 1500;
     init_out("large_frag");
     av_dict_set(&opts, "movflags", "+frag_keyframe+delay_moov", 0);
-    init_fps(1, 1, 3);
+    init_fps(1, 1, 3, 0);
     mux_gops(2);
     finish();
     close_out();
@@ -757,7 +785,7 @@ int main(int argc, char **argv)
     init_out("vfr-noduration-interleave");
     av_dict_set(&opts, "movflags", "+frag_keyframe+delay_moov", 0);
     av_dict_set(&opts, "frag_duration", "650000", 0);
-    init_fps(1, 1, 30);
+    init_fps(1, 1, 30, 0);
     mux_frames(gop_size/2, 0);
     // Pretend that the packet duration is the normal, even if
     // we actually skip a bunch of frames. (I.e., simulate that
@@ -794,6 +822,19 @@ int main(int argc, char **argv)
     finish();
     close_out();
 
+    // Write a manually fragmented file, with timed ID3 packets at the head
+    // of each fragment.
+    init_out("emsg");
+    av_dict_set(&opts, "movflags", "+frag_custom+cmaf", 0);
+    init_fps(1, 0, 30, 1);
+    mux_id3();
+    mux_gops(2);
+    av_write_frame(ctx, NULL); // Flush fragment.
+    mux_id3();
+    mux_gops(2);
+    finish();
+    close_out();
+
     av_free(md5);
     av_packet_free(&pkt);
 
diff --git a/tests/ref/fate/movenc b/tests/ref/fate/movenc
index 0c77f5187c..5c12aeb29f 100644
--- a/tests/ref/fate/movenc
+++ b/tests/ref/fate/movenc
@@ -151,3 +151,11 @@ write_data len 900, time 0, type sync atom moof
 write_data len 908, time 1000000, type sync atom moof
 write_data len 148, time nopts, type trailer atom -
 3be575022e446855bca1e45b7942cc0c 3115 empty-moov-neg-cts
+write_data len 28, time nopts, type header atom ftyp
+write_data len 1123, time nopts, type header atom -
+write_data len 70, time 0, type boundary atom emsg
+write_data len 1832, time 0, type sync atom moof
+write_data len 70, time 2000000, type boundary atom emsg
+write_data len 1840, time 2000000, type sync atom moof
+write_data len 148, time nopts, type trailer atom -
+b72c56c795693820b156f452354a51ff 5111 emsg
-- 
2.39.3 (Apple Git-146)



More information about the ffmpeg-devel mailing list