[FFmpeg-devel] [PATCH 2/2] avformat/movenc: parse h264 packets to build Sync Sample and Recovery Point tables

Thu Jul 29 01:43:43 EEST 2021

James Almer:
> Since we can't blindly trust the keyframe flag in packets and assume its
> contents are a valid Sync Sample, do some basic bitstream parsing to build the
> Sync Sample table in addition to a Random Access Recovery Point table.
> 
> Suggested-by: ffmpeg at fb.com
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
>  libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>  libavformat/movenc.h         |   1 +
>  tests/ref/lavf-fate/h264.mp4 |   6 +-
>  3 files changed, 123 insertions(+), 9 deletions(-)
> 
> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> index 57062f45c5..159e0261b7 100644
> --- a/libavformat/movenc.c
> +++ b/libavformat/movenc.c
> @@ -34,13 +34,17 @@
>  #include "avc.h"
>  #include "libavcodec/ac3_parser_internal.h"
>  #include "libavcodec/dnxhddata.h"
> +#include "libavcodec/h264.h"
> +#include "libavcodec/h2645_parse.h"
>  #include "libavcodec/flac.h"
>  #include "libavcodec/get_bits.h"
> +#include "libavcodec/golomb.h"
>  
>  #include "libavcodec/internal.h"
>  #include "libavcodec/put_bits.h"
>  #include "libavcodec/vc1_common.h"
>  #include "libavcodec/raw.h"
> +#include "libavcodec/sei.h"
>  #include "internal.h"
>  #include "libavutil/avstring.h"
>  #include "libavutil/channel_layout.h"
> @@ -2537,7 +2541,9 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>      if (!sgpd_entries)
>          return AVERROR(ENOMEM);
>  
> -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
> +               track->par->codec_id == AV_CODEC_ID_AAC  ||
> +               track->par->codec_id == AV_CODEC_ID_H264);
>  
>      if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>          for (i = 0; i < track->entry; i++) {
> @@ -2545,7 +2551,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>              int distance = 0;
>              for (j = i - 1; j >= 0; j--) {
>                  roll_samples_remaining -= get_cluster_duration(track, j);
> -                distance++;
> +                distance--;
>                  if (roll_samples_remaining <= 0)
>                      break;
>              }
> @@ -2555,7 +2561,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>              if (roll_samples_remaining > 0)
>                  distance = 0;
>              /* Verify distance is a maximum of 32 (2.5ms) packets. */
> -            if (distance > 32)
> +            if (distance < 32)
>                  return AVERROR_INVALIDDATA;
>              if (i && distance == sgpd_entries[entries].roll_distance) {
>                  sgpd_entries[entries].count++;
> @@ -2566,10 +2572,22 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>                  sgpd_entries[entries].group_description_index = distance ? ++group : 0;
>              }
>          }
> +    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
> +        for (i = 0; i < track->entry; i++) {
> +            int distance = track->cluster[i].roll_distance;
> +            if (i && distance == sgpd_entries[entries].roll_distance) {
> +                sgpd_entries[entries].count++;
> +            } else {
> +                entries++;
> +                sgpd_entries[entries].count = 1;
> +                sgpd_entries[entries].roll_distance = distance;
> +                sgpd_entries[entries].group_description_index = distance ? ++group : 0;
> +            }
> +        }
>      } else {
>          entries++;
>          sgpd_entries[entries].count = track->sample_count;
> -        sgpd_entries[entries].roll_distance = 1;
> +        sgpd_entries[entries].roll_distance = -1;

You seem to be negate roll_distance in this patch; shouldn't this be a
separate patch?

>          sgpd_entries[entries].group_description_index = ++group;
>      }
>      entries++;
> @@ -2588,7 +2606,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>      avio_wb32(pb, group); /* entry_count */
>      for (i = 0; i < entries; i++) {
>          if (sgpd_entries[i].group_description_index) {
> -            avio_wb16(pb, -sgpd_entries[i].roll_distance); /* roll_distance */
> +            avio_wb16(pb, sgpd_entries[i].roll_distance); /* roll_distance */
>          }
>      }
>  
> @@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>      if (track->cenc.aes_ctr) {
>          ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>      }
> -    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
> +    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
> +        track->par->codec_id == AV_CODEC_ID_AAC  ||
> +        track->par->codec_id == AV_CODEC_ID_H264) {
>          mov_preroll_write_stbl_atoms(pb, track);
>      }
>      return update_size(pb, pos);
> @@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket *pkt, uint32_t *flags)
>      return 0;
>  }
>  
> +static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
> +{
> +    GetBitContext gb;
> +    const uint8_t *buf = pkt->data;
> +    const uint8_t *buf_end = pkt->data + pkt->size;
> +    uint32_t state = -1;
> +    unsigned roll_distance = 0;
> +    int nal_length_size = 0, nalsize;
> +    int idr = 0;
> +
> +    if (!pkt->size)
> +        return 0;
> +    if (trk->vos_data && trk->vos_data[0] == 1) {
> +        if (trk->vos_len < 5)
> +            return 0;
> +        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
> +    }
> +
> +    while (buf_end - buf >= 4) {
> +        if (nal_length_size) {
> +            int i = 0;
> +            nalsize = get_nalsize(nal_length_size, buf, buf_end - buf, &i, NULL);
> +            if (nalsize < 0)
> +                break;
> +            state = buf[nal_length_size];
> +            buf += nal_length_size + 1;
> +        } else {
> +            buf = avpriv_find_start_code(buf, buf_end, &state);

This checks the whole packet for start codes. But given that we have to
convert annex B input to ISOBMFF format anyway, this separate codepath
seems avoidable by working with already reformatted data here.
(In any case: In case this is not an IDR access unit you really check
the whole packet, even when you have already encountered a VLC NALU
despite SEIs having to be in front of them.)

> +            if (buf >= buf_end)
> +                break;
> +        }
> +
> +        switch (state & 0x1f) {
> +        case H264_NAL_IDR_SLICE:
> +            idr = 1;
> +            goto end;
> +        case H264_NAL_SEI:
> +            init_get_bits8(&gb, buf, buf_end - buf);

Weird that you checked the init_get_bits8() below but not this one.

> +            do {
> +                GetBitContext gb_payload;
> +                int ret, type = 0, size = 0;
> +
> +                do {
> +                    if (get_bits_left(&gb) < 8)
> +                        goto end;
> +                    type += show_bits(&gb, 8);
> +                } while (get_bits(&gb, 8) == 255);
> +                do {
> +                    if (get_bits_left(&gb) < 8)
> +                        goto end;
> +                    size += show_bits(&gb, 8);
> +                } while (get_bits(&gb, 8) == 255);
> +
> +                if (size > get_bits_left(&gb) / 8)
> +                    goto end;
> +
> +                ret = init_get_bits8(&gb_payload, gb.buffer + get_bits_count(&gb) / 8, size);
> +                if (ret < 0)
> +                    goto end;
> +
> +                switch (type) {
> +                case SEI_TYPE_RECOVERY_POINT:
> +                    roll_distance = get_ue_golomb_long(&gb_payload);
> +                    break;
> +                default:
> +                    break;
> +                }
> +                skip_bits_long(&gb, 8 * size);
> +            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) != 0x80);

1. You are using the outer bitreader gb only for byte-aligned
reads/skips; better read bytes directly.
2. You are not unescaping the buffer: The SEI size is the size of an
unescaped SEI (after 0x03 has been stripped away). Knowing the size in
advance comes in really handy for this; this is where only working with
mp4-formatted data brings benefits.

> +            break;
> +        default:
> +            break;
> +        }
> +        if (nal_length_size)
> +            buf += nalsize - 1;
> +    }
> +
> +end:
> +    if (roll_distance != (int16_t)roll_distance)

The H.264 spec restricts this field to values in the range
0..MaxFrameNum - 1, where the latter can be UINT16_MAX. So I don't
understand why you are using a signed int16_t here and as type in MOVIentry.

> +        roll_distance = 0;
> +    trk->cluster[trk->entry].roll_distance = roll_distance;
> +
> +    if (idr) {
> +        trk->cluster[trk->entry].flags |= MOV_SYNC_SAMPLE;
> +        trk->has_keyframes++;
> +    }
> +
> +    return 0;
> +}
> +
>  static void mov_parse_vc1_frame(AVPacket *pkt, MOVTrack *trk)
>  {
>      const uint8_t *start, *next, *end = pkt->data + pkt->size;
> @@ -5740,6 +5850,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>      trk->cluster[trk->entry].entries          = samples_in_chunk;
>      trk->cluster[trk->entry].dts              = pkt->dts;
>      trk->cluster[trk->entry].pts              = pkt->pts;
> +    trk->cluster[trk->entry].roll_distance    = 0;
>      if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
>          if (!trk->frag_discont) {
>              /* First packet of a new fragment. We already wrote the duration
> @@ -5821,6 +5932,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>  
>      if (par->codec_id == AV_CODEC_ID_VC1) {
>          mov_parse_vc1_frame(pkt, trk);
> +    } else if (par->codec_id == AV_CODEC_ID_H264) {
> +        mov_parse_h264_frame(pkt, trk);
>      } else if (par->codec_id == AV_CODEC_ID_TRUEHD) {
>          mov_parse_truehd_frame(pkt, trk);
>      } else if (pkt->flags & AV_PKT_FLAG_KEY) {
> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
> index af1ea0bce6..73bf73ce8f 100644
> --- a/libavformat/movenc.h
> +++ b/libavformat/movenc.h
> @@ -56,6 +56,7 @@ typedef struct MOVIentry {
>  #define MOV_PARTIAL_SYNC_SAMPLE 0x0002
>  #define MOV_DISPOSABLE_SAMPLE   0x0004
>      uint32_t     flags;
> +    int16_t      roll_distance;
>      AVProducerReferenceTime prft;
>  } MOVIentry;
>  
> diff --git a/tests/ref/lavf-fate/h264.mp4 b/tests/ref/lavf-fate/h264.mp4
> index a9c3823c2c..c08ee4c7ae 100644
> --- a/tests/ref/lavf-fate/h264.mp4
> +++ b/tests/ref/lavf-fate/h264.mp4
> @@ -1,3 +1,3 @@
> -fe299ea5205b71a48281f917b1256a5d *tests/data/lavf-fate/lavf.h264.mp4
> -547928 tests/data/lavf-fate/lavf.h264.mp4
> -tests/data/lavf-fate/lavf.h264.mp4 CRC=0x9da2c999
> +2812f617314d23474fcb23898b8a56ab *tests/data/lavf-fate/lavf.h264.mp4
> +548084 tests/data/lavf-fate/lavf.h264.mp4
> +tests/data/lavf-fate/lavf.h264.mp4 CRC=0x396f0829
> 
I guess this filesize increase is due to the SyncSample table? What
exactly has been written before this patch, what gets written with it?

- Andreas