[FFmpeg-devel] [PATCH 2/2] avformat/movenc: parse h264 packets to build Sync Sample and Recovery Point tables

Thu Jul 29 13:23:27 EEST 2021

James Almer:
> On 7/28/2021 7:43 PM, Andreas Rheinhardt wrote:
>> James Almer:
>>> Since we can't blindly trust the keyframe flag in packets and assume its
>>> contents are a valid Sync Sample, do some basic bitstream parsing to
>>> build the
>>> Sync Sample table in addition to a Random Access Recovery Point table.
>>>
>>> Suggested-by: ffmpeg at fb.com
>>> Signed-off-by: James Almer <jamrial at gmail.com>
>>> ---
>>>   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>>   libavformat/movenc.h         |   1 +
>>>   tests/ref/lavf-fate/h264.mp4 |   6 +-
>>>   3 files changed, 123 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
>>> index 57062f45c5..159e0261b7 100644
>>> --- a/libavformat/movenc.c
>>> +++ b/libavformat/movenc.c
>>> @@ -34,13 +34,17 @@
>>>   #include "avc.h"
>>>   #include "libavcodec/ac3_parser_internal.h"
>>>   #include "libavcodec/dnxhddata.h"
>>> +#include "libavcodec/h264.h"
>>> +#include "libavcodec/h2645_parse.h"
>>>   #include "libavcodec/flac.h"
>>>   #include "libavcodec/get_bits.h"
>>> +#include "libavcodec/golomb.h"
>>>     #include "libavcodec/internal.h"
>>>   #include "libavcodec/put_bits.h"
>>>   #include "libavcodec/vc1_common.h"
>>>   #include "libavcodec/raw.h"
>>> +#include "libavcodec/sei.h"
>>>   #include "internal.h"
>>>   #include "libavutil/avstring.h"
>>>   #include "libavutil/channel_layout.h"
>>> @@ -2537,7 +2541,9 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>       if (!sgpd_entries)
>>>           return AVERROR(ENOMEM);
>>>   -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> track->par->codec_id == AV_CODEC_ID_AAC);
>>> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> +               track->par->codec_id == AV_CODEC_ID_AAC  ||
>>> +               track->par->codec_id == AV_CODEC_ID_H264);
>>>         if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>>>           for (i = 0; i < track->entry; i++) {
>>> @@ -2545,7 +2551,7 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>               int distance = 0;
>>>               for (j = i - 1; j >= 0; j--) {
>>>                   roll_samples_remaining -=
>>> get_cluster_duration(track, j);
>>> -                distance++;
>>> +                distance--;
>>>                   if (roll_samples_remaining <= 0)
>>>                       break;
>>>               }
>>> @@ -2555,7 +2561,7 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>               if (roll_samples_remaining > 0)
>>>                   distance = 0;
>>>               /* Verify distance is a maximum of 32 (2.5ms) packets. */
>>> -            if (distance > 32)
>>> +            if (distance < 32)
>>>                   return AVERROR_INVALIDDATA;
>>>               if (i && distance ==
>>> sgpd_entries[entries].roll_distance) {
>>>                   sgpd_entries[entries].count++;
>>> @@ -2566,10 +2572,22 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>                   sgpd_entries[entries].group_description_index =
>>> distance ? ++group : 0;
>>>               }
>>>           }
>>> +    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
>>> +        for (i = 0; i < track->entry; i++) {
>>> +            int distance = track->cluster[i].roll_distance;
>>> +            if (i && distance == sgpd_entries[entries].roll_distance) {
>>> +                sgpd_entries[entries].count++;
>>> +            } else {
>>> +                entries++;
>>> +                sgpd_entries[entries].count = 1;
>>> +                sgpd_entries[entries].roll_distance = distance;
>>> +                sgpd_entries[entries].group_description_index =
>>> distance ? ++group : 0;
>>> +            }
>>> +        }
>>>       } else {
>>>           entries++;
>>>           sgpd_entries[entries].count = track->sample_count;
>>> -        sgpd_entries[entries].roll_distance = 1;
>>> +        sgpd_entries[entries].roll_distance = -1;
>>
>> You seem to be negate roll_distance in this patch; shouldn't this be a
>> separate patch?
> 
> I'll split it.
> 
>>
>>>           sgpd_entries[entries].group_description_index = ++group;
>>>       }
>>>       entries++;
>>> @@ -2588,7 +2606,7 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>       avio_wb32(pb, group); /* entry_count */
>>>       for (i = 0; i < entries; i++) {
>>>           if (sgpd_entries[i].group_description_index) {
>>> -            avio_wb16(pb, -sgpd_entries[i].roll_distance); /*
>>> roll_distance */
>>> +            avio_wb16(pb, sgpd_entries[i].roll_distance); /*
>>> roll_distance */
>>>           }
>>>       }
>>>   @@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext
>>> *s, AVIOContext *pb, MOVMuxContext
>>>       if (track->cenc.aes_ctr) {
>>>           ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>>>       }
>>> -    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> track->par->codec_id == AV_CODEC_ID_AAC) {
>>> +    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> +        track->par->codec_id == AV_CODEC_ID_AAC  ||
>>> +        track->par->codec_id == AV_CODEC_ID_H264) {
>>>           mov_preroll_write_stbl_atoms(pb, track);
>>>       }
>>>       return update_size(pb, pos);
>>> @@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket
>>> *pkt, uint32_t *flags)
>>>       return 0;
>>>   }
>>>   +static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
>>> +{
>>> +    GetBitContext gb;
>>> +    const uint8_t *buf = pkt->data;
>>> +    const uint8_t *buf_end = pkt->data + pkt->size;
>>> +    uint32_t state = -1;
>>> +    unsigned roll_distance = 0;
>>> +    int nal_length_size = 0, nalsize;
>>> +    int idr = 0;
>>> +
>>> +    if (!pkt->size)
>>> +        return 0;
>>> +    if (trk->vos_data && trk->vos_data[0] == 1) {
>>> +        if (trk->vos_len < 5)
>>> +            return 0;
>>> +        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
>>> +    }
>>> +
>>> +    while (buf_end - buf >= 4) {
>>> +        if (nal_length_size) {
>>> +            int i = 0;
>>> +            nalsize = get_nalsize(nal_length_size, buf, buf_end -
>>> buf, &i, NULL);
>>> +            if (nalsize < 0)
>>> +                break;
>>> +            state = buf[nal_length_size];
>>> +            buf += nal_length_size + 1;
>>> +        } else {
>>> +            buf = avpriv_find_start_code(buf, buf_end, &state);
>>
>> This checks the whole packet for start codes. But given that we have to
>> convert annex B input to ISOBMFF format anyway, this separate codepath
>> seems avoidable by working with already reformatted data here.
>> (In any case: In case this is not an IDR access unit you really check
>> the whole packet, even when you have already encountered a VLC NALU
>> despite SEIs having to be in front of them.)
> 
> I'll look into it.
> 
>>
>>> +            if (buf >= buf_end)
>>> +                break;
>>> +        }
>>> +
>>> +        switch (state & 0x1f) {
>>> +        case H264_NAL_IDR_SLICE:
>>> +            idr = 1;
>>> +            goto end;
>>> +        case H264_NAL_SEI:
>>> +            init_get_bits8(&gb, buf, buf_end - buf);
>>
>> Weird that you checked the init_get_bits8() below but not this one.
> 
> The one below was copy-pasted. Will also check this one.
> 
>>
>>> +            do {
>>> +                GetBitContext gb_payload;
>>> +                int ret, type = 0, size = 0;
>>> +
>>> +                do {
>>> +                    if (get_bits_left(&gb) < 8)
>>> +                        goto end;
>>> +                    type += show_bits(&gb, 8);
>>> +                } while (get_bits(&gb, 8) == 255);
>>> +                do {
>>> +                    if (get_bits_left(&gb) < 8)
>>> +                        goto end;
>>> +                    size += show_bits(&gb, 8);
>>> +                } while (get_bits(&gb, 8) == 255);
>>> +
>>> +                if (size > get_bits_left(&gb) / 8)
>>> +                    goto end;
>>> +
>>> +                ret = init_get_bits8(&gb_payload, gb.buffer +
>>> get_bits_count(&gb) / 8, size);
>>> +                if (ret < 0)
>>> +                    goto end;
>>> +
>>> +                switch (type) {
>>> +                case SEI_TYPE_RECOVERY_POINT:
>>> +                    roll_distance = get_ue_golomb_long(&gb_payload);
>>> +                    break;
>>> +                default:
>>> +                    break;
>>> +                }
>>> +                skip_bits_long(&gb, 8 * size);
>>> +            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) !=
>>> 0x80);
>>
>> 1. You are using the outer bitreader gb only for byte-aligned
>> reads/skips; better read bytes directly.
> 
> Using GetBitContext is cleaner and clearer IMO, but I could use
> GetByteContext instead i guess. I'd rather not work directly on bytes.
> 

I think we disagree on that: I always hate it when I have to use a
bitreader in case the position is known to be byte-aligned.

>> 2. You are not unescaping the buffer: The SEI size is the size of an
>> unescaped SEI (after 0x03 has been stripped away). Knowing the size in
>> advance comes in really handy for this; this is where only working with
>> mp4-formatted data brings benefits.
> 
> A SEI NALu may have more than one SEI message, and the Recovery Point
> may not be the first. If i can't trust the size read in the loop above
> because it does not take into account the emulation prevention bytes,
> how can i feasibly skip each SEI message within a give NALU?
> 

By unescaping the SEI NALU first. (Alternatively, one could write a
bitreader (necessarily cached) that unescapes internally, but that is
more effort; and it would be slower with bigger NALUs, so the fuzzer
will probably report some new timeouts.)

- Andreas