[FFmpeg-devel] [PATCH] NVENC Surface Allocation Reduction

Tue Apr 25 14:56:07 EEST 2017

> From f3917a452c3e0636c27876e84a4e3b57bb78dae5 Mon Sep 17 00:00:00 2001
> From: Ben Chang <benc at nvidia.com>
> Date: Wed, 19 Apr 2017 13:07:31 -0700
> Subject: [PATCH] NVENC surface allocation reduction
> 
> This patch aims to reduce the number of input/output surfaces
> NVENC allocates per session. Previous default sets allocated surfaces to 32
> (unless there is user specified param or lookahead involved). Having large
> number of surfaces consumes extra video memory (esp for higher resolution
> encoding). The patch changes the surfaces calculation for default, B-frames,
> lookahead scenario respectively.
> 
> The other change involves surface selection. Previously, if a session 
> allocates x surfaces, only x-1 surfaces are used (due to combination 
> of output delay and lock toggle logic). To prevent unused surfaces, 
> changing surface rotation to using predefined fifo.
> ---
>  libavcodec/nvenc.c      | 56 ++++++++++++++++++++++++++++++++++---------------
>  libavcodec/nvenc.h      |  1 +
>  libavcodec/nvenc_h264.c |  4 ++--
>  libavcodec/nvenc_hevc.c |  4 ++--
>  4 files changed, 44 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
> index cf05455..0a1b290 100644
> --- a/libavcodec/nvenc.c
> +++ b/libavcodec/nvenc.c
> @@ -644,16 +644,34 @@ static void nvenc_override_rate_control(AVCodecContext *avctx)
>  static av_cold int nvenc_recalc_surfaces(AVCodecContext *avctx)
>  {
>      NvencContext *ctx = avctx->priv_data;
> -    int nb_surfaces = 0;
> +    // default minimum of 4 surfaces

Did you test if and how much it affects performance to reduce the default delay from 32 to 4?
This was originally done because nvenc is extremely slow if you try to download the frames without some delay headroom.

> +    // *2 for number of NVENCs, *2 for avoid blocking next PBB group

What do you mean by "*2 for number of NVENCs"?

> +    int nb_surfaces = FFMAX(4, ctx->encode_config.frameIntervalP * 2 * 2);
>  
> +    // lookahead enabled
>      if (ctx->rc_lookahead > 0) {
> -        nb_surfaces = ctx->rc_lookahead + ((ctx->encode_config.frameIntervalP > 0) ? ctx->encode_config.frameIntervalP : 0) + 1 + 4;
> -        if (ctx->nb_surfaces < nb_surfaces) {
> +        // +1 is to account for lkd_bound calculation later
> +        // +4 is to allow sufficient pipelining with lookahead
> +        nb_surfaces = FFMAX(1, FFMAX(nb_surfaces, ctx->rc_lookahead + ctx->encode_config.frameIntervalP + 1 + 4));
> +        if ((nb_surfaces > ctx->nb_surfaces) && (ctx->nb_surfaces > 0))

Pointless braces

> +        {
>              av_log(avctx, AV_LOG_WARNING,
>                     "Defined rc_lookahead requires more surfaces, "
>                     "increasing used surfaces %d -> %d\n", ctx->nb_surfaces, nb_surfaces);
> -            ctx->nb_surfaces = nb_surfaces;
>          }
> +        ctx->nb_surfaces = FFMAX(nb_surfaces, ctx->nb_surfaces);
> +    }
> +    else {

Should be on one line.

> +        if ((ctx->encode_config.frameIntervalP > 1) && (ctx->nb_surfaces < nb_surfaces) && (ctx->nb_surfaces > 0))

Pointless braces

> +        {
> +            av_log(avctx, AV_LOG_WARNING,
> +                   "Defined b-frame requires more surfaces, "
> +                   "increasing used surfaces %d -> %d\n", ctx->nb_surfaces, nb_surfaces);
> +            ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, nb_surfaces);
> +        }
> +        else if (!(ctx->nb_surfaces > 0))

Why not just <= 0?

> +            ctx->nb_surfaces = nb_surfaces;
> +        // otherwise use user specified value
>      }
>  
>      ctx->nb_surfaces = FFMAX(1, FFMIN(MAX_REGISTERED_FRAMES, ctx->nb_surfaces));
> @@ -1121,8 +1139,6 @@ static av_cold int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
>          ctx->surfaces[idx].height = allocSurf.height;
>      }
>  
> -    ctx->surfaces[idx].lockCount = 0;
> -
>      /* 1MB is large enough to hold most output frames.
>       * NVENC increases this automaticaly if it is not enough. */
>      allocOut.size = 1024 * 1024;
> @@ -1141,6 +1157,9 @@ static av_cold int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
>      ctx->surfaces[idx].output_surface = allocOut.bitstreamBuffer;
>      ctx->surfaces[idx].size = allocOut.size;
>  
> +    NvencSurface* tmp = &ctx->surfaces[idx];

Mixed code and declaration.
Also, I'd prefer a less generic name, like tmp_surface.

> +    av_fifo_generic_write(ctx->IO_surface_queue, &tmp, sizeof(NvencSurface*), NULL);

For safety and consistency, I'd prefer to have just sizeof(tmp).

> +
>      return 0;
>  }
>  
> @@ -1156,6 +1175,11 @@ static av_cold int nvenc_setup_surfaces(AVCodecContext *avctx)
>      ctx->timestamp_list = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
>      if (!ctx->timestamp_list)
>          return AVERROR(ENOMEM);
> +
> +    ctx->IO_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
> +    if (!ctx->IO_surface_queue)
> +        return AVERROR(ENOMEM);
> +
>      ctx->output_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
>      if (!ctx->output_surface_queue)
>          return AVERROR(ENOMEM);
> @@ -1222,6 +1246,7 @@ av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
>      av_fifo_freep(&ctx->timestamp_list);
>      av_fifo_freep(&ctx->output_surface_ready_queue);
>      av_fifo_freep(&ctx->output_surface_queue);
> +    av_fifo_freep(&ctx->IO_surface_queue);
>  
>      if (ctx->surfaces && avctx->pix_fmt == AV_PIX_FMT_CUDA) {
>          for (i = 0; i < ctx->nb_surfaces; ++i) {
> @@ -1305,16 +1330,15 @@ av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
>  
>  static NvencSurface *get_free_frame(NvencContext *ctx)
>  {
> -    int i;
> -
> -    for (i = 0; i < ctx->nb_surfaces; i++) {
> -        if (!ctx->surfaces[i].lockCount) {
> -            ctx->surfaces[i].lockCount = 1;
> -            return &ctx->surfaces[i];
> -        }
> +    if (!(av_fifo_size(ctx->IO_surface_queue) > 0))
> +    {
> +        // queue empty
> +        return NULL;
>      }
>  
> -    return NULL;
> +    NvencSurface *tmpSurf;

Mixed code and declaration as well.
Also, please no camel case, I know it's still left in some places,
but I'm gradually trying to only use full lowercase variables with underscores.
So tmp_surf would be nicer here.

> +    av_fifo_generic_read(ctx->IO_surface_queue, &tmpSurf, sizeof(tmpSurf), NULL);
> +    return tmpSurf;
>  }
>  
>  static int nvenc_copy_frame(AVCodecContext *avctx, NvencSurface *nv_surface,
> @@ -1712,7 +1736,6 @@ int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
>          }
>  
>          if (res) {
> -            inSurf->lockCount = 0;
>              return res;
>          }
>  
> @@ -1790,8 +1813,7 @@ int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
>          if (res)
>              return res;
>  
> -        av_assert0(tmpoutsurf->lockCount);
> -        tmpoutsurf->lockCount--;

If the lockCount is now completely unused, which it should be,
it should be removed from the struct.

> +        av_fifo_generic_write(ctx->IO_surface_queue, &tmpoutsurf, sizeof(tmpoutsurf), NULL);
>  
>          *got_packet = 1;
>      } else {
> diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
> index 7dec5cc..5426dec 100644
> --- a/libavcodec/nvenc.h
> +++ b/libavcodec/nvenc.h
> @@ -110,6 +110,7 @@ typedef struct NvencContext
>      int nb_surfaces;
>      NvencSurface *surfaces;
>  
> +    AVFifoBuffer *IO_surface_queue;

I'm not really a fan of this variable name.
First, it should be all lower case, but besides that, wouldn't something
like unused_surface_queue be more clear about what this buffer does?

>      AVFifoBuffer *output_surface_queue;
>      AVFifoBuffer *output_surface_ready_queue;
>      AVFifoBuffer *timestamp_list;
> diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
> index 2c55b60..8d44b1f 100644
> --- a/libavcodec/nvenc_h264.c
> +++ b/libavcodec/nvenc_h264.c
> @@ -79,8 +79,8 @@ static const AVOption options[] = {
>                                                              0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
>      { "vbr_2pass",    "Multi-pass variable bitrate mode",   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
>      { "rc-lookahead", "Number of frames to look ahead for rate-control",
> -                                                            OFFSET(rc_lookahead), AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, INT_MAX, VE },
> -    { "surfaces",     "Number of concurrent surfaces",      OFFSET(nb_surfaces),  AV_OPT_TYPE_INT,   { .i64 = 32 },  0, MAX_REGISTERED_FRAMES, VE },
> +                                                            OFFSET(rc_lookahead), AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, INT_MAX, VE },

Why the change of default here? Kinda gives up the possibility to differentiate
between unset and user-set to 0.

> +    { "surfaces",     "Number of concurrent surfaces",      OFFSET(nb_surfaces),  AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, MAX_REGISTERED_FRAMES, VE },
>      { "cbr",          "Use cbr encoding mode",              OFFSET(cbr),          AV_OPT_TYPE_BOOL,  { .i64 = 0 },   0, 1, VE },
>      { "2pass",        "Use 2pass encoding mode",            OFFSET(twopass),      AV_OPT_TYPE_BOOL,  { .i64 = -1 }, -1, 1, VE },
>      { "gpu",          "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.",
> diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
> index c32ba42..6d6750a 100644
> --- a/libavcodec/nvenc_hevc.c
> +++ b/libavcodec/nvenc_hevc.c
> @@ -78,8 +78,8 @@ static const AVOption options[] = {
>                                                              0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
>      { "vbr_2pass",    "Multi-pass variable bitrate mode",   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
>      { "rc-lookahead", "Number of frames to look ahead for rate-control",
> -                                                            OFFSET(rc_lookahead), AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, INT_MAX, VE },
> -    { "surfaces",     "Number of concurrent surfaces",      OFFSET(nb_surfaces),  AV_OPT_TYPE_INT,   { .i64 = 32 },  0, MAX_REGISTERED_FRAMES, VE },
> +                                                            OFFSET(rc_lookahead), AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, INT_MAX, VE },
> +    { "surfaces",     "Number of concurrent surfaces",      OFFSET(nb_surfaces),  AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, MAX_REGISTERED_FRAMES, VE },
>      { "cbr",          "Use cbr encoding mode",              OFFSET(cbr),          AV_OPT_TYPE_BOOL,  { .i64 = 0 },   0, 1, VE },
>      { "2pass",        "Use 2pass encoding mode",            OFFSET(twopass),      AV_OPT_TYPE_BOOL,  { .i64 = -1 }, -1, 1, VE },
>      { "gpu",          "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.",
> -- 
> 2.9.1

But in general a very nice set of changes, using a fifo for the unused surfaces
seems way more natural than the lock "counter" used before.