[FFmpeg-devel] [PATCH 2/2] avformat/movenc: parse h264 packets to build Sync Sample and Recovery Point tables

Thu Jul 29 02:55:15 EEST 2021

On 7/28/2021 7:43 PM, Andreas Rheinhardt wrote:
> James Almer:
>> Since we can't blindly trust the keyframe flag in packets and assume its
>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>> Sync Sample table in addition to a Random Access Recovery Point table.
>>
>> Suggested-by: ffmpeg at fb.com
>> Signed-off-by: James Almer <jamrial at gmail.com>
>> ---
>>   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>   libavformat/movenc.h         |   1 +
>>   tests/ref/lavf-fate/h264.mp4 |   6 +-
>>   3 files changed, 123 insertions(+), 9 deletions(-)
>>
>> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
>> index 57062f45c5..159e0261b7 100644
>> --- a/libavformat/movenc.c
>> +++ b/libavformat/movenc.c
>> @@ -34,13 +34,17 @@
>>   #include "avc.h"
>>   #include "libavcodec/ac3_parser_internal.h"
>>   #include "libavcodec/dnxhddata.h"
>> +#include "libavcodec/h264.h"
>> +#include "libavcodec/h2645_parse.h"
>>   #include "libavcodec/flac.h"
>>   #include "libavcodec/get_bits.h"
>> +#include "libavcodec/golomb.h"
>>   
>>   #include "libavcodec/internal.h"
>>   #include "libavcodec/put_bits.h"
>>   #include "libavcodec/vc1_common.h"
>>   #include "libavcodec/raw.h"
>> +#include "libavcodec/sei.h"
>>   #include "internal.h"
>>   #include "libavutil/avstring.h"
>>   #include "libavutil/channel_layout.h"
>> @@ -2537,7 +2541,9 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>       if (!sgpd_entries)
>>           return AVERROR(ENOMEM);
>>   
>> -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
>> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
>> +               track->par->codec_id == AV_CODEC_ID_AAC  ||
>> +               track->par->codec_id == AV_CODEC_ID_H264);
>>   
>>       if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>>           for (i = 0; i < track->entry; i++) {
>> @@ -2545,7 +2551,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>               int distance = 0;
>>               for (j = i - 1; j >= 0; j--) {
>>                   roll_samples_remaining -= get_cluster_duration(track, j);
>> -                distance++;
>> +                distance--;
>>                   if (roll_samples_remaining <= 0)
>>                       break;
>>               }
>> @@ -2555,7 +2561,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>               if (roll_samples_remaining > 0)
>>                   distance = 0;
>>               /* Verify distance is a maximum of 32 (2.5ms) packets. */
>> -            if (distance > 32)
>> +            if (distance < 32)
>>                   return AVERROR_INVALIDDATA;
>>               if (i && distance == sgpd_entries[entries].roll_distance) {
>>                   sgpd_entries[entries].count++;
>> @@ -2566,10 +2572,22 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>                   sgpd_entries[entries].group_description_index = distance ? ++group : 0;
>>               }
>>           }
>> +    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
>> +        for (i = 0; i < track->entry; i++) {
>> +            int distance = track->cluster[i].roll_distance;
>> +            if (i && distance == sgpd_entries[entries].roll_distance) {
>> +                sgpd_entries[entries].count++;
>> +            } else {
>> +                entries++;
>> +                sgpd_entries[entries].count = 1;
>> +                sgpd_entries[entries].roll_distance = distance;
>> +                sgpd_entries[entries].group_description_index = distance ? ++group : 0;
>> +            }
>> +        }
>>       } else {
>>           entries++;
>>           sgpd_entries[entries].count = track->sample_count;
>> -        sgpd_entries[entries].roll_distance = 1;
>> +        sgpd_entries[entries].roll_distance = -1;
> 
> You seem to be negate roll_distance in this patch; shouldn't this be a
> separate patch?

I'll split it.

> 
>>           sgpd_entries[entries].group_description_index = ++group;
>>       }
>>       entries++;
>> @@ -2588,7 +2606,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>       avio_wb32(pb, group); /* entry_count */
>>       for (i = 0; i < entries; i++) {
>>           if (sgpd_entries[i].group_description_index) {
>> -            avio_wb16(pb, -sgpd_entries[i].roll_distance); /* roll_distance */
>> +            avio_wb16(pb, sgpd_entries[i].roll_distance); /* roll_distance */
>>           }
>>       }
>>   
>> @@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>>       if (track->cenc.aes_ctr) {
>>           ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>>       }
>> -    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
>> +    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
>> +        track->par->codec_id == AV_CODEC_ID_AAC  ||
>> +        track->par->codec_id == AV_CODEC_ID_H264) {
>>           mov_preroll_write_stbl_atoms(pb, track);
>>       }
>>       return update_size(pb, pos);
>> @@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket *pkt, uint32_t *flags)
>>       return 0;
>>   }
>>   
>> +static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
>> +{
>> +    GetBitContext gb;
>> +    const uint8_t *buf = pkt->data;
>> +    const uint8_t *buf_end = pkt->data + pkt->size;
>> +    uint32_t state = -1;
>> +    unsigned roll_distance = 0;
>> +    int nal_length_size = 0, nalsize;
>> +    int idr = 0;
>> +
>> +    if (!pkt->size)
>> +        return 0;
>> +    if (trk->vos_data && trk->vos_data[0] == 1) {
>> +        if (trk->vos_len < 5)
>> +            return 0;
>> +        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
>> +    }
>> +
>> +    while (buf_end - buf >= 4) {
>> +        if (nal_length_size) {
>> +            int i = 0;
>> +            nalsize = get_nalsize(nal_length_size, buf, buf_end - buf, &i, NULL);
>> +            if (nalsize < 0)
>> +                break;
>> +            state = buf[nal_length_size];
>> +            buf += nal_length_size + 1;
>> +        } else {
>> +            buf = avpriv_find_start_code(buf, buf_end, &state);
> 
> This checks the whole packet for start codes. But given that we have to
> convert annex B input to ISOBMFF format anyway, this separate codepath
> seems avoidable by working with already reformatted data here.
> (In any case: In case this is not an IDR access unit you really check
> the whole packet, even when you have already encountered a VLC NALU
> despite SEIs having to be in front of them.)

I'll look into it.

> 
>> +            if (buf >= buf_end)
>> +                break;
>> +        }
>> +
>> +        switch (state & 0x1f) {
>> +        case H264_NAL_IDR_SLICE:
>> +            idr = 1;
>> +            goto end;
>> +        case H264_NAL_SEI:
>> +            init_get_bits8(&gb, buf, buf_end - buf);
> 
> Weird that you checked the init_get_bits8() below but not this one.

The one below was copy-pasted. Will also check this one.

> 
>> +            do {
>> +                GetBitContext gb_payload;
>> +                int ret, type = 0, size = 0;
>> +
>> +                do {
>> +                    if (get_bits_left(&gb) < 8)
>> +                        goto end;
>> +                    type += show_bits(&gb, 8);
>> +                } while (get_bits(&gb, 8) == 255);
>> +                do {
>> +                    if (get_bits_left(&gb) < 8)
>> +                        goto end;
>> +                    size += show_bits(&gb, 8);
>> +                } while (get_bits(&gb, 8) == 255);
>> +
>> +                if (size > get_bits_left(&gb) / 8)
>> +                    goto end;
>> +
>> +                ret = init_get_bits8(&gb_payload, gb.buffer + get_bits_count(&gb) / 8, size);
>> +                if (ret < 0)
>> +                    goto end;
>> +
>> +                switch (type) {
>> +                case SEI_TYPE_RECOVERY_POINT:
>> +                    roll_distance = get_ue_golomb_long(&gb_payload);
>> +                    break;
>> +                default:
>> +                    break;
>> +                }
>> +                skip_bits_long(&gb, 8 * size);
>> +            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) != 0x80);
> 
> 1. You are using the outer bitreader gb only for byte-aligned
> reads/skips; better read bytes directly.

Using GetBitContext is cleaner and clearer IMO, but I could use 
GetByteContext instead i guess. I'd rather not work directly on bytes.

> 2. You are not unescaping the buffer: The SEI size is the size of an
> unescaped SEI (after 0x03 has been stripped away). Knowing the size in
> advance comes in really handy for this; this is where only working with
> mp4-formatted data brings benefits.

A SEI NALu may have more than one SEI message, and the Recovery Point 
may not be the first. If i can't trust the size read in the loop above 
because it does not take into account the emulation prevention bytes, 
how can i feasibly skip each SEI message within a give NALU?

> 
>> +            break;
>> +        default:
>> +            break;
>> +        }
>> +        if (nal_length_size)
>> +            buf += nalsize - 1;
>> +    }
>> +
>> +end:
>> +    if (roll_distance != (int16_t)roll_distance)
> 
> The H.264 spec restricts this field to values in the range
> 0..MaxFrameNum - 1, where the latter can be UINT16_MAX. So I don't
> understand why you are using a signed int16_t here and as type in MOVIentry.

Because ISOBMFF's roll_distance is an int16 field.

> 
>> +        roll_distance = 0;
>> +    trk->cluster[trk->entry].roll_distance = roll_distance;
>> +
>> +    if (idr) {
>> +        trk->cluster[trk->entry].flags |= MOV_SYNC_SAMPLE;
>> +        trk->has_keyframes++;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static void mov_parse_vc1_frame(AVPacket *pkt, MOVTrack *trk)
>>   {
>>       const uint8_t *start, *next, *end = pkt->data + pkt->size;
>> @@ -5740,6 +5850,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>>       trk->cluster[trk->entry].entries          = samples_in_chunk;
>>       trk->cluster[trk->entry].dts              = pkt->dts;
>>       trk->cluster[trk->entry].pts              = pkt->pts;
>> +    trk->cluster[trk->entry].roll_distance    = 0;
>>       if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
>>           if (!trk->frag_discont) {
>>               /* First packet of a new fragment. We already wrote the duration
>> @@ -5821,6 +5932,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>>   
>>       if (par->codec_id == AV_CODEC_ID_VC1) {
>>           mov_parse_vc1_frame(pkt, trk);
>> +    } else if (par->codec_id == AV_CODEC_ID_H264) {
>> +        mov_parse_h264_frame(pkt, trk);
>>       } else if (par->codec_id == AV_CODEC_ID_TRUEHD) {
>>           mov_parse_truehd_frame(pkt, trk);
>>       } else if (pkt->flags & AV_PKT_FLAG_KEY) {
>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>> index af1ea0bce6..73bf73ce8f 100644
>> --- a/libavformat/movenc.h
>> +++ b/libavformat/movenc.h
>> @@ -56,6 +56,7 @@ typedef struct MOVIentry {
>>   #define MOV_PARTIAL_SYNC_SAMPLE 0x0002
>>   #define MOV_DISPOSABLE_SAMPLE   0x0004
>>       uint32_t     flags;
>> +    int16_t      roll_distance;
>>       AVProducerReferenceTime prft;
>>   } MOVIentry;
>>   
>> diff --git a/tests/ref/lavf-fate/h264.mp4 b/tests/ref/lavf-fate/h264.mp4
>> index a9c3823c2c..c08ee4c7ae 100644
>> --- a/tests/ref/lavf-fate/h264.mp4
>> +++ b/tests/ref/lavf-fate/h264.mp4
>> @@ -1,3 +1,3 @@
>> -fe299ea5205b71a48281f917b1256a5d *tests/data/lavf-fate/lavf.h264.mp4
>> -547928 tests/data/lavf-fate/lavf.h264.mp4
>> -tests/data/lavf-fate/lavf.h264.mp4 CRC=0x9da2c999
>> +2812f617314d23474fcb23898b8a56ab *tests/data/lavf-fate/lavf.h264.mp4
>> +548084 tests/data/lavf-fate/lavf.h264.mp4
>> +tests/data/lavf-fate/lavf.h264.mp4 CRC=0x396f0829
>>
> I guess this filesize increase is due to the SyncSample table? What
> exactly has been written before this patch, what gets written with it?

The size increase is due to the addition of Recovery Point sample 
groups. The Sync Sample table is smaller now that we're not listing the 
recovery point packets in it, but ultimately the file is bigger because 
of the presence of a sgpd and sbgp full boxes.

> 
> - Andreas
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>