[Libav-user] Copying source video timestamps to destination [3.1.5]

Thu Oct 27 06:07:13 EEST 2016

I am writing an application that decodes a single video stream from an
input file (any codec, any container), does a bunch of image processing,
and encodes the results to an output file (single video stream, Quicktime
RLE, MOV).

There is a 1:1 correspondence between input and output frames and I want
the frame timing in the output to be identical to the input. I am having a
really, *really* hard time accomplishing this. So my general question is: *How
do I reliably (as in, in all cases of inputs) set the output frame timing
identical to the input?*

It took me a very long time to slog through the API and get to the point I
am at now. I put together a minimal test program to work with:

#include <cstdio>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/avutil.h>
#include <libavutil/imgutils.h>
#include <libswscale/swscale.h>
}

using namespace std;

// Everything needed to demux and decode the single input video stream.
struct DecoderStuff {
    AVFormatContext *formatx;   // input container, opened in main
    int nstream;                // index of the chosen video stream within formatx
    AVCodec *codec;             // decoder selected by av_find_best_stream
    AVStream *stream;           // formatx->streams[nstream]
    AVCodecContext *codecx;     // decoder context allocated for codec
    AVFrame *rawframe;          // most recently decoded frame, as produced by the decoder
    AVFrame *rgbframe;          // rawframe converted to RGB24 by sws_scale
    SwsContext *swsx;           // scaler context used for the rawframe -> rgbframe conversion
};

// Everything needed to encode and mux the single output video stream.
struct EncoderStuff {
    AVFormatContext *formatx;   // output container created by avformat_alloc_output_context2
    AVCodec *codec;             // encoder looked up with avcodec_find_encoder
    AVStream *stream;           // the one stream added to formatx
    AVCodecContext *codecx;     // encoder context allocated for codec
};

// Prints "<what> timebase: num/den" for any object that exposes a
// `time_base` rational (main passes streams and codec contexts), or a
// placeholder line when the object pointer is null.
template <typename T>
static void dump_timebase (const char *what, const T *o) {
    if (!o) {
        printf("%s timebase: null object\n", what);
        return;
    }
    printf("%s timebase: %d/%d\n", what, o->time_base.num, o->time_base.den);
}

// Reads the next video frame into d.rawframe, converts it to RGB24 in
// d.rgbframe, and stamps d.rgbframe->pts with the decoder's best-effort
// timestamp for that frame (expressed in d.stream->time_base units).
// Once the demuxer reports end of file the decoder is drained, so frames it
// was still holding are returned as well.
// Returns false on error or when no frames remain.
static bool read_frame (DecoderStuff &d) {

    AVPacket packet;
    int err = 0, haveframe = 0;

    // read packets until the decoder produces a frame or something fails
    while (!haveframe && err >= 0 && ((err = av_read_frame(d.formatx, &packet)) >= 0)) {
        if (packet.stream_index == d.nstream) {
            err = avcodec_decode_video2(d.codecx, d.rawframe, &haveframe, &packet);
        }
        av_packet_unref(&packet);
    }

    // end of input: an empty packet asks a decoder with delay to hand back
    // one of its buffered frames; a decoder without delay just reports none.
    if (!haveframe && err == AVERROR_EOF) {
        AVPacket flush;
        av_init_packet(&flush);
        flush.data = NULL;
        flush.size = 0;
        int flusherr = avcodec_decode_video2(d.codecx, d.rawframe, &haveframe, &flush);
        if (flusherr < 0)
            err = flusherr;
    }

    // error output (plain end of file is not reported as an error)
    if (!haveframe && err != AVERROR_EOF) {
        char buf[500];
        av_strerror(err, buf, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = 0;
        printf("read_frame: %s\n", buf);
    }

    // convert to rgb and carry the timestamp across, since sws_scale only
    // touches pixel data
    if (haveframe) {
        sws_scale(d.swsx, d.rawframe->data, d.rawframe->linesize, 0, d.rawframe->height,
                  d.rgbframe->data, d.rgbframe->linesize);
        // best-effort timestamp falls back to the packet dts when the
        // container supplies no pts
        d.rgbframe->pts = av_frame_get_best_effort_timestamp(d.rawframe);
    }

    return haveframe != 0;

}

// Encodes one frame and writes the resulting packet to the output file.
// The frame handed to the encoder is a freshly allocated RGB24 test pattern
// with inframe's dimensions and inframe's pts; inframe->pts must therefore be
// expressed in e.codecx->time_base units (or be AV_NOPTS_VALUE).
// Returns false on error.
static bool write_frame (EncoderStuff &e, AVFrame *inframe) {

    int err = 0, havepacket = 0;
    bool haveimage = false;

    // a new frame with its own buffer is used instead of inframe itself,
    // mirroring what the real application does
    AVFrame *outframe = av_frame_alloc();
    if (!outframe)
        err = AVERROR(ENOMEM);

    if (err >= 0) {
        outframe->format = inframe->format;
        outframe->width = inframe->width;
        outframe->height = inframe->height;
        outframe->pts = inframe->pts; // keep the source frame's timing
        err = av_image_alloc(outframe->data, outframe->linesize, outframe->width, outframe->height,
                             AV_PIX_FMT_RGB24, 1);
        haveimage = (err >= 0);
    }

    if (haveimage) {

        // test pattern that shifts by one pixel per frame
        //av_frame_copy(outframe, inframe);
        static int count = 0;
        for (int n = 0; n < outframe->width * outframe->height; ++ n) {
            outframe->data[0][n*3+0] = ((n+count) % 100) ? 0 : 255;
            outframe->data[0][n*3+1] = ((n+count) % 100) ? 0 : 255;
            outframe->data[0][n*3+2] = ((n+count) % 100) ? 0 : 255;
        }
        ++ count;

        AVPacket packet;
        av_init_packet(&packet);
        packet.size = 0;
        packet.data = NULL;

        err = avcodec_encode_video2(e.codecx, &packet, outframe, &havepacket);
        if (err >= 0 && havepacket) {
            // The encoder stamps the packet in its own time base, but the
            // muxer expects the stream time base, which it may have changed
            // while writing the header. Unset timestamps are left untouched.
            av_packet_rescale_ts(&packet, e.codecx->time_base, e.stream->time_base);
            packet.stream_index = e.stream->index;
            err = av_interleaved_write_frame(e.formatx, &packet);
        }

        av_packet_unref(&packet);

    }

    if (err < 0) {
        char buf[500];
        av_strerror(err, buf, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = 0;
        printf("write_frame: %s\n", buf);
    }

    if (haveimage)
        av_freep(&outframe->data[0]);
    av_frame_free(&outframe);

    return err >= 0;

}

int main (int argc, char *argv[]) {

    const char *infile = "wildlife.wmv";
    const char *outfile = "test.mov";
    DecoderStuff d = {};
    EncoderStuff e = {};

    av_register_all();

    // decoder
    avformat_open_input(&d.formatx, infile, NULL, NULL);
    avformat_find_stream_info(d.formatx, NULL);
    d.nstream = av_find_best_stream(d.formatx, AVMEDIA_TYPE_VIDEO, -1,
-1, &d.codec, 0);
    d.stream = d.formatx->streams[d.nstream];
    d.codecx = avcodec_alloc_context3(d.codec);
    avcodec_parameters_to_context(d.codecx, d.stream->codecpar);
    avcodec_open2(d.codecx, NULL, NULL);
    d.rawframe = av_frame_alloc();
    d.rgbframe = av_frame_alloc();
    d.rgbframe->format = AV_PIX_FMT_RGB24;
    d.rgbframe->width = d.codecx->width;
    d.rgbframe->height = d.codecx->height;
    av_frame_get_buffer(d.rgbframe, 1);
    d.swsx = sws_getContext(d.codecx->width, d.codecx->height,
d.codecx->pix_fmt,
                            d.codecx->width, d.codecx->height, AV_PIX_FMT_RGB24,
                            SWS_POINT, NULL, NULL, NULL);
    //av_dump_format(d.formatx, 0, infile, 0);
    dump_timebase("in stream", d.stream);
    dump_timebase("in stream:codec", d.stream->codec); // note: deprecated
    dump_timebase("in codec", d.codecx);

    // encoder
    avformat_alloc_output_context2(&e.formatx, NULL, NULL, outfile);
    e.codec = avcodec_find_encoder(AV_CODEC_ID_QTRLE);
    e.stream = avformat_new_stream(e.formatx, e.codec);
    e.codecx = avcodec_alloc_context3(e.codec);
    e.codecx->bit_rate = 4000000; // arbitrary for qtrle
    e.codecx->width = d.codecx->width;
    e.codecx->height = d.codecx->height;
    e.codecx->gop_size = 30; // 99% sure this is arbitrary for qtrle
    e.codecx->pix_fmt = AV_PIX_FMT_RGB24;
    e.codecx->time_base = d.stream->time_base; // ???
    e.codecx->flags |= (e.formatx->flags & AVFMT_GLOBALHEADER) ?
AV_CODEC_FLAG_GLOBAL_HEADER : 0;
    avcodec_open2(e.codecx, NULL, NULL);
    avcodec_parameters_from_context(e.stream->codecpar, e.codecx);
    //av_dump_format(e.formatx, 0, outfile, 1);
    dump_timebase("out stream", e.stream);
    dump_timebase("out stream:codec", e.stream->codec); // note: deprecated
    dump_timebase("out codec", e.codecx);

    // open file and write header
    avio_open(&e.formatx->pb, outfile, AVIO_FLAG_WRITE);
    avformat_write_header(e.formatx, NULL);

    // frames
    while (read_frame(d) && write_frame(e, d.rgbframe))
        ;

    // write trailer and close file
    av_write_trailer(e.formatx);
    avio_closep(&e.formatx->pb);

}

A few notes about that:

   - Since all of my attempts at frame timing so far have failed, I’ve
   removed almost all timing-related stuff from this code to start with a
   clean slate.
   - Almost all error checking and cleanup omitted for brevity.
   - The reason I allocate a new output frame with a new buffer in
   write_frame, rather than using inframe directly, is because this is more
   representative of what my real application is doing. My real app also uses
   RGB24 internally, hence the conversions here.
   - The reason I generate a weird pattern in outframe, rather than using
   e.g. av_frame_copy, is that I just wanted a test pattern that
   compresses well with Quicktime RLE (my test input ends up generating a
   1.7GB output file otherwise).
   - The input video I am using, “wildlife.wmv”, can be found here
   <https://www.dropbox.com/s/rld7uwsp9hvtvv1/wildlife.wmv?dl=0>. I’ve
   hard-coded the filenames.
   - I am aware that avcodec_decode_video2 and avcodec_encode_video2 are
   deprecated, but don’t care. They work fine, I’ve already struggled too much
   getting my head around the latest version of the API, ffmpeg changes their
   API with nearly every release, and I really don’t feel like dealing with
   avcodec_send_* and avcodec_receive_*
   <https://ffmpeg.org/doxygen/3.1/group__lavc__encdec.html> right now.
   - I think I’m supposed to be finishing off by passing a NULL frame to
   avcodec_encode_video2
   <https://ffmpeg.org/doxygen/3.1/group__lavc__encoding.html#ga2c08a4729f72f9bdac41b5533c4f2642>
to
   flush some buffers or something but I’m a bit confused about that. Unless
   somebody feels like explaining that let’s ignore it for now, it’s a
   separate question. The docs are as vague about this point as they are about
   everything else.
   - My test input file’s frame rate is 29.97.

------------------------------

Now, as for my current attempts. The following timing related fields are
present in the above code, with details/confusion in bold. There’s a lot of
them, because the API is mind-bogglingly convoluted:

   - main: d.stream->time_base: Input video stream time base. *For my test
   input file this is 1/1000.*
   - main: d.stream->codec->time_base: Not sure what this is (I never could
   make sense of why AVStream has an AVCodecContext field when you always
   use your own new context anyways) and also the codec field is
   deprecated. *For my test input file this is 1/1000.*
   - main: d.codecx->time_base: Input codec context time-base. *For my test
   input file this is 0/1. Am I supposed to set it?*
   - main: e.stream->time_base: Time base of the output stream I create. *What
   do I set this to?*
   - main: e.stream->codec->time_base: Time base of the deprecated and
   mysterious codec field of the output stream I create. *Do I set this to
   anything?*
   - main: e.codecx->time_base: Time base of the encoder context I
create. *What
   do I set this to?*
   - read_frame: packet.dts: Decoding timestamp of packet read.
   - read_frame: packet.pts: Presentation timestamp of packet read.
   - read_frame: packet.duration: Duration of packet read.
   - read_frame: d.rawframe->pts: Presentation timestamp of raw frame
   decoded. *This is always 0. Why isn’t it read by the decoder…?*
   - read_frame: d.rgbframe->pts / write_frame: inframe->pts: Presentation
   timestamp of decoded frame converted to RGB. Not set to anything currently.
   - read_frame: d.rawframe->pkt_*: Fields copied from packet, discovered
   after reading this post <http://stackoverflow.com/a/18842961/616460>.
   They are set correctly but I don’t know if they are useful.
   - write_frame: outframe->pts: Presentation timestamp of frame being
   encoded. *Should I set this to something?*
   - write_frame: outframe->pkt_*: Timing fields from a packet. *Should I
   set these? They seem to be ignored by the encoder.*
   - write_frame: packet.dts: Decoding timestamp of packet being encoded. *What
   do I set it to?*
   - write_frame: packet.pts: Presentation timestamp of packet being
   encoded. *What do I set it to?*
   - write_frame: packet.duration: Duration of packet being encoded. *What
   do I set it to?*

I have tried the following, with the described results. Note that inframe
 is d.rgbframe:

   1.

   - Init e.stream->time_base = d.stream->time_base
   - Init e.codecx->time_base = d.codecx->time_base
   - Set d.rgbframe->pts = packet.dts in read_frame
   - Set outframe->pts = inframe->pts in write_frame
   - Result: Warning that encoder time base is not set (since
d.codecx->time_base
   was 0/1), seg fault.

   2.

   - Init e.stream->time_base = d.stream->time_base
   - Init e.codecx->time_base = d.stream->time_base
   - Set d.rgbframe->pts = packet.dts in read_frame
   - Set outframe->pts = inframe->pts in write_frame
   - Result: No warnings, but VLC reports frame rate as 480.048 (no idea
   where this number came from) and file plays too fast. Also the encoder sets
   all the timing fields in packet to 0, which was not what I expected.

   3.

   - Init e.stream->time_base = d.stream->time_base
   - Init e.codecx->time_base = d.stream->time_base
   - Set d.rgbframe->pts = packet.dts in read_frame
   - Set any of pts/dts/duration in packet in write_frame to anything.
   - Result: Warnings about packet timestamps not set. Encoder seems to
   reset all packet timing fields to 0, so none of this has any effect.

   4.

   - Init e.stream->time_base = d.stream->time_base
   - Init e.codecx->time_base = d.stream->time_base
   - I found these fields, pkt_pts, pkt_dts, and pkt_duration in AVFrame after
   reading this post <http://stackoverflow.com/a/18842961/616460>, so I
   tried copying those all the way through to outframe.
   - Result: Really had my hopes up, but ended up with same results as
   attempt 3 (packet timestamp not set warning, incorrect results).

I tried various other hand-wavey permutations of the above and nothing
worked. What I *want* to do is create an output file that plays back with
the same timing and frame rate as the input (29.97 constant frame rate in
this case).

*So how do I do this?* Of the zillions of timing related fields here, what
do I do to make the output be the same as the input? And how do I do it in
such a way that handles arbitrary video input formats that may store their
time stamps and time bases in different places? I need this to always work.

I realize this is a long post. It’s also the shortest I could make it. The
sheer complexity of this post is indicative of how desperately frustrated I
am with ffmpeg right now.
------------------------------

For reference, here is a table of all the packet and frame timestamps read
from the video stream of my test input file, to give a sense of what my
test file looks like. None of the input packets have a pts set, and neither
do the decoded frames; for some reason the duration of the first 108 frames
is 0. VLC plays the file fine and reports the frame rate as 29.9700089:

   - Table is here <http://pastebin.com/6EYsmuTp>.

Thanks,
J
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://ffmpeg.org/pipermail/libav-user/attachments/20161026/e8f7ea75/attachment.html>