[FFmpeg-devel] [PATCH 1/2] avfilter: add scale_d3d11 filter

Dash Santosh Sathyanarayanan dash.sathyanarayanan at multicorewareinc.com
Thu May 22 20:57:44 EEST 2025


On 22-05-2025 20:55, Timo Rothenpieler wrote:
> On 22/05/2025 15:20, Dash Santosh Sathyanarayanan wrote:
>> This commit introduces a new hardware-accelerated video filter, 
>> scale_d3d11,
>> which performs scaling and format conversion using Direct3D 11. The 
>> filter enables
>> efficient GPU-based scaling and pixel format conversion (p010 to 
>> nv12), reducing
>> CPU overhead and latency in video pipelines.
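For context, an example invocation this patch targets would look roughly like
the following (resolution, output format and the choice of h264_mf are
illustrative; only the filter and its width/height/output_fmt options come
from this patch):

  ffmpeg -hwaccel d3d11va -hwaccel_output_format d3d11 -i input.mp4 \
         -vf scale_d3d11=width=1280:height=720:output_fmt=nv12 \
         -c:v h264_mf output.mp4

The intent is to keep decode, scaling/format conversion and encode on the GPU.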
>> ---
>>   Changelog                     |   1 +
>>   libavcodec/decode.c           |   2 +-
>>   libavcodec/dxva2.c            |   3 +
>>   libavfilter/Makefile          |   1 +
>>   libavfilter/allfilters.c      |   1 +
>>   libavfilter/vf_scale_d3d11.c  | 480 ++++++++++++++++++++++++++++++++++
>>   libavutil/hwcontext_d3d11va.c |  40 ++-
>>   7 files changed, 514 insertions(+), 14 deletions(-)
>>   create mode 100644 libavfilter/vf_scale_d3d11.c
>>
>> diff --git a/Changelog b/Changelog
>> index 4217449438..68610a63d0 100644
>> --- a/Changelog
>> +++ b/Changelog
>> @@ -18,6 +18,7 @@ version <next>:
>>   - APV encoding support through a libopenapv wrapper
>>   - VVC decoder supports all content of SCC (Screen Content Coding):
>>     IBC (Inter Block Copy), Palette Mode and ACT (Adaptive Color
>> Transform)
>> +- vf_scale_d3d11 filter
>
> Bit of a nit, this could at least say "Added".
Oops, sorry. My bad.
>
>>       version 7.1:
>> diff --git a/libavcodec/decode.c b/libavcodec/decode.c
>> index c2b2dd6e3b..a796ae7930 100644
>> --- a/libavcodec/decode.c
>> +++ b/libavcodec/decode.c
>> @@ -1079,7 +1079,7 @@ int ff_decode_get_hw_frames_ctx(AVCodecContext 
>> *avctx,
>>       if (frames_ctx->initial_pool_size) {
>>           // We guarantee 4 base work surfaces. The function above 
>> guarantees 1
>>           // (the absolute minimum), so add the missing count.
>> -        frames_ctx->initial_pool_size += 3;
>> +        frames_ctx->initial_pool_size += 33;
>
> This seems a bit extreme, and can potentially drastically increase 
> VRAM usage of anything using d3d11va.
In a full hardware pipeline, when all surfaces are in use, we hit pool 
exhaustion and the "Static surface pool size exceeded" error occurs; hence 
the change. The increase in memory footprint was about 100 MB with this change.
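
As a rough sanity check on that figure (assuming 1080p NV12 decode surfaces; 
the exact cost depends on resolution and bit depth):

  bytes per NV12 surface ≈ 1920 * 1088 * 3/2 ≈ 3.1 MB
  30 extra surfaces (+33 instead of +3) ≈ 30 * 3.1 MB ≈ 94 MB

so the ~100 MB observed matches simply holding 30 more decode-sized surfaces; 
P010 or larger resolutions would scale that up accordingly.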
>
>>       }
>>         ret = av_hwframe_ctx_init(avctx->hw_frames_ctx);
>> diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
>> index 22ecd5acaf..37dab6cd68 100644
>> --- a/libavcodec/dxva2.c
>> +++ b/libavcodec/dxva2.c
>> @@ -647,6 +647,9 @@ int ff_dxva2_common_frame_params(AVCodecContext 
>> *avctx,
>>           AVD3D11VAFramesContext *frames_hwctx = frames_ctx->hwctx;
>>             frames_hwctx->BindFlags |= D3D11_BIND_DECODER;
>> +        if (frames_ctx->sw_format == AV_PIX_FMT_NV12) {
>> +            frames_hwctx->BindFlags |= D3D11_BIND_VIDEO_ENCODER;
>> +        }
>
> This change also seems a bit random here. Using NV12 does not 
> automatically mean you'll encode with it.
The encoder requires D3D11_BIND_VIDEO_ENCODER to be set on the input 
surface when the D3D11 surface is sent directly to the MF encoder. 
Currently the MF encoder only supports 8-bit input (NV12); hence the 
change. If the input is 10-bit (P010), scale_d3d11 can be configured to 
output 8-bit NV12 frames.
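
To make the requirement concrete, below is a minimal sketch of allocating a 
D3D11 frames context whose surfaces can be handed straight to the encoder. 
It mirrors the frames-context setup scale_d3d11_config_props does further 
down in this patch; the helper name and parameters are placeholders, not 
part of the patch.

#include "libavutil/error.h"
#include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_d3d11va.h"

/* Hypothetical helper: allocate a D3D11 frames context whose surfaces can
 * be fed directly to the MF encoder. */
static int alloc_encoder_frames(AVBufferRef *device_ref, int width, int height,
                                AVBufferRef **out_frames_ref)
{
    AVBufferRef *frames_ref = av_hwframe_ctx_alloc(device_ref);
    AVHWFramesContext *frames;
    AVD3D11VAFramesContext *hwctx;
    int ret;

    if (!frames_ref)
        return AVERROR(ENOMEM);

    frames = (AVHWFramesContext *)frames_ref->data;
    hwctx  = frames->hwctx;

    frames->format            = AV_PIX_FMT_D3D11;
    frames->sw_format         = AV_PIX_FMT_NV12;  /* MF encoder input is 8-bit NV12 */
    frames->width             = width;
    frames->height            = height;
    frames->initial_pool_size = 30;

    /* Without D3D11_BIND_VIDEO_ENCODER the encoder cannot use the texture. */
    hwctx->BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_VIDEO_ENCODER;

    ret = av_hwframe_ctx_init(frames_ref);
    if (ret < 0) {
        av_buffer_unref(&frames_ref);
        return ret;
    }

    *out_frames_ref = frames_ref;
    return 0;
}

For decoder-allocated surfaces (the dxva2.c hunk above), the flag is OR'ed 
into the existing BindFlags instead.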
>
>
> Did not look at the rest yet.
>> +        return AVERROR_EXTERNAL;
>> +    }
>> +
>> +    ///< Set up output frame
>> +    ret = av_frame_copy_props(out, in);
>> +    if (ret < 0) {
>> +        av_log(ctx, AV_LOG_ERROR, "Failed to copy frame properties\n");
>> +        videoContext->lpVtbl->Release(videoContext);
>> +        inputView->lpVtbl->Release(inputView);
>> +        av_frame_free(&in);
>> +        av_frame_free(&out);
>> +        return ret;
>> +    }
>> +
>> +    out->data[0] = (uint8_t *)output_texture;
>> +    out->data[1] = (uint8_t *)(intptr_t)0;
>> +    out->width = s->width;
>> +    out->height = s->height;
>> +    out->format = AV_PIX_FMT_D3D11;
>> +
>> +    ///< Clean up resources
>> +    inputView->lpVtbl->Release(inputView);
>> +    videoContext->lpVtbl->Release(videoContext);
>> +    if (s->outputView) {
>> +        s->outputView->lpVtbl->Release(s->outputView);
>> +        s->outputView = NULL;
>> +    }
>> +    av_frame_free(&in);
>> +
>> +    ///< Forward the frame
>> +    return ff_filter_frame(outlink, out);
>> +}
>> +
>> +static int scale_d3d11_config_props(AVFilterLink *outlink)
>> +{
>> +    AVFilterContext *ctx = outlink->src;
>> +    ScaleD3D11Context *s = ctx->priv;
>> +    AVFilterLink *inlink = ctx->inputs[0];
>> +    FilterLink *inl = ff_filter_link(inlink);
>> +    FilterLink *outl = ff_filter_link(outlink);
>> +    int ret;
>> +
>> +    ///< Clean up any previous resources
>> +    release_d3d11_resources(s);
>> +
>> +    ///< Evaluate output dimensions
>> +    ret = ff_scale_eval_dimensions(s, s->w_expr, s->h_expr, inlink, 
>> outlink, &s->width, &s->height);
>> +    if (ret < 0) {
>> +        av_log(ctx, AV_LOG_ERROR, "Failed to evaluate dimensions\n");
>> +        return ret;
>> +    }
>> +
>> +    outlink->w = s->width;
>> +    outlink->h = s->height;
>> +
>> +    ///< Validate input hw_frames_ctx
>> +    if (!inl->hw_frames_ctx) {
>> +        av_log(ctx, AV_LOG_ERROR, "No hw_frames_ctx available on 
>> input link\n");
>> +        return AVERROR(EINVAL);
>> +    }
>> +
>> +    ///< Propagate hw_frames_ctx to output
>> +    outl->hw_frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
>> +    if (!outl->hw_frames_ctx) {
>> +        av_log(ctx, AV_LOG_ERROR, "Failed to propagate hw_frames_ctx 
>> to output\n");
>> +        return AVERROR(ENOMEM);
>> +    }
>> +
>> +    ///< Initialize filter's hardware device context
>> +    if (!s->hw_device_ctx) {
>> +        AVHWFramesContext *in_frames_ctx = (AVHWFramesContext 
>> *)inl->hw_frames_ctx->data;
>> +        s->hw_device_ctx = av_buffer_ref(in_frames_ctx->device_ref);
>> +        if (!s->hw_device_ctx) {
>> +            av_log(ctx, AV_LOG_ERROR, "Failed to initialize filter 
>> hardware device context\n");
>> +            return AVERROR(ENOMEM);
>> +        }
>> +    }
>> +
>> +    ///< Get D3D11 device and context (but don't initialize 
>> processor yet - done in filter_frame)
>> +    AVHWDeviceContext *hwctx = (AVHWDeviceContext 
>> *)s->hw_device_ctx->data;
>> +    AVD3D11VADeviceContext *d3d11_hwctx = (AVD3D11VADeviceContext 
>> *)hwctx->hwctx;
>> +
>> +    s->device = d3d11_hwctx->device;
>> +    s->context = d3d11_hwctx->device_context;
>> +
>> +    if (!s->device || !s->context) {
>> +        av_log(ctx, AV_LOG_ERROR, "Failed to get valid D3D11 device 
>> or context\n");
>> +        return AVERROR(EINVAL);
>> +    }
>> +
>> +    ///< Create new hardware frames context for output
>> +    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext 
>> *)inl->hw_frames_ctx->data;
>> +    s->hw_frames_ctx_out = av_hwframe_ctx_alloc(s->hw_device_ctx);
>> +    if (!s->hw_frames_ctx_out)
>> +        return AVERROR(ENOMEM);
>> +
>> +    enum AVPixelFormat sw_format;
>> +    switch (s->output_format_opt) {
>> +        case OUTPUT_NV12:
>> +            sw_format = AV_PIX_FMT_NV12;
>> +            break;
>> +        case OUTPUT_P010:
>> +            sw_format = AV_PIX_FMT_P010;
>> +            break;
>> +        default:
>> +            return AVERROR(EINVAL);
>> +    }
>> +
>> +    AVHWFramesContext *frames_ctx = (AVHWFramesContext 
>> *)s->hw_frames_ctx_out->data;
>> +    frames_ctx->format = AV_PIX_FMT_D3D11;
>> +    frames_ctx->sw_format = sw_format;
>> +    frames_ctx->width = s->width;
>> +    frames_ctx->height = s->height;
>> +    frames_ctx->initial_pool_size = 30; ///< Adjust pool size as needed
>> +
>> +    AVD3D11VAFramesContext *frames_hwctx = frames_ctx->hwctx;
>> +    frames_hwctx->MiscFlags = 0;
>> +    frames_hwctx->BindFlags = D3D11_BIND_RENDER_TARGET | 
>> D3D11_BIND_VIDEO_ENCODER;
>> +
>> +    ret = av_hwframe_ctx_init(s->hw_frames_ctx_out);
>> +    if (ret < 0) {
>> +        av_buffer_unref(&s->hw_frames_ctx_out);
>> +        return ret;
>> +    }
>> +
>> +    outl->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx_out);
>> +    if (!outl->hw_frames_ctx)
>> +        return AVERROR(ENOMEM);
>> +
>> +    av_log(ctx, AV_LOG_VERBOSE, "D3D11 scale config: %dx%d -> %dx%d\n",
>> +           inlink->w, inlink->h, outlink->w, outlink->h);
>> +    return 0;
>> +}
>> +
>> +static av_cold void scale_d3d11_uninit(AVFilterContext *ctx) {
>> +    ScaleD3D11Context *s = ctx->priv;
>> +
>> +    ///< Release D3D11 resources
>> +    release_d3d11_resources(s);
>> +
>> +    ///< Free the hardware device context reference
>> +    av_buffer_unref(&s->hw_frames_ctx_out);
>> +    av_buffer_unref(&s->hw_device_ctx);
>> +
>> +    ///< Free option strings
>> +    av_freep(&s->w_expr);
>> +    av_freep(&s->h_expr);
>> +}
>> +
>> +static const AVFilterPad scale_d3d11_inputs[] = {
>> +    {
>> +        .name         = "default",
>> +        .type         = AVMEDIA_TYPE_VIDEO,
>> +        .filter_frame = scale_d3d11_filter_frame,
>> +    },
>> +};
>> +
>> +static const AVFilterPad scale_d3d11_outputs[] = {
>> +    {
>> +        .name         = "default",
>> +        .type         = AVMEDIA_TYPE_VIDEO,
>> +        .config_props = scale_d3d11_config_props,
>> +    },
>> +};
>> +
>> +#define OFFSET(x) offsetof(ScaleD3D11Context, x)
>> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
>> +
>> +static const AVOption scale_d3d11_options[] = {
>> +    { "width",  "Output video width",  OFFSET(w_expr), 
>> AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS },
>> +    { "height", "Output video height", OFFSET(h_expr), 
>> AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS },
>> +    { "output_fmt", "Output format", OFFSET(output_format_opt), 
>> AV_OPT_TYPE_INT, {.i64 = OUTPUT_NV12}, 0, OUTPUT_P010, FLAGS, "fmt" },
>> +    { "nv12", "NV12 format", 0, AV_OPT_TYPE_CONST, {.i64 = 
>> OUTPUT_NV12}, 0, 0, FLAGS, "fmt" },
>> +    { "p010", "P010 format", 0, AV_OPT_TYPE_CONST, {.i64 = 
>> OUTPUT_P010}, 0, 0, FLAGS, "fmt" },
>> +    { NULL }
>> +};
>> +
>> +AVFILTER_DEFINE_CLASS(scale_d3d11);
>> +
>> +const FFFilter ff_vf_scale_d3d11 = {
>> +    .p.name           = "scale_d3d11",
>> +    .p.description    = NULL_IF_CONFIG_SMALL("Scale video using 
>> Direct3D11"),
>> +    .priv_size        = sizeof(ScaleD3D11Context),
>> +    .p.priv_class     = &scale_d3d11_class,
>> +    .init             = scale_d3d11_init,
>> +    .uninit           = scale_d3d11_uninit,
>> +    FILTER_INPUTS(scale_d3d11_inputs),
>> +    FILTER_OUTPUTS(scale_d3d11_outputs),
>> +    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_D3D11),
>> +    .p.flags          = AVFILTER_FLAG_HWDEVICE,
>> +    .flags_internal   = FF_FILTER_FLAG_HWFRAME_AWARE,
>> +};
>> \ No newline at end of file
>> diff --git a/libavutil/hwcontext_d3d11va.c 
>> b/libavutil/hwcontext_d3d11va.c
>> index 1a047ce57b..36694896e4 100644
>> --- a/libavutil/hwcontext_d3d11va.c
>> +++ b/libavutil/hwcontext_d3d11va.c
>> @@ -82,6 +82,8 @@ typedef struct D3D11VAFramesContext {
>>         int nb_surfaces;
>>       int nb_surfaces_used;
>> +    int retries;
>> +    int max_retries;
>>         DXGI_FORMAT format;
>>   @@ -258,7 +260,9 @@ static AVBufferRef *d3d11va_pool_alloc(void 
>> *opaque, size_t size)
>>       ID3D11Texture2D_GetDesc(hwctx->texture, &texDesc);
>>         if (s->nb_surfaces_used >= texDesc.ArraySize) {
>> -        av_log(ctx, AV_LOG_ERROR, "Static surface pool size 
>> exceeded.\n");
>> +        if (s->retries >= s->max_retries) {
>> +            av_log(ctx, AV_LOG_ERROR, "Static surface pool size 
>> exceeded.\n");
>> +        }
>>           return NULL;
>>       }
>>   @@ -339,20 +343,30 @@ static int 
>> d3d11va_frames_init(AVHWFramesContext *ctx)
>>   static int d3d11va_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
>>   {
>>       AVD3D11FrameDescriptor *desc;
>> +    D3D11VAFramesContext       *s = ctx->hwctx;
>> +    s->retries = 0;
>> +    s->max_retries = 50;
>> +
>> +    while (s->retries < s->max_retries) {
>> +
>> +        frame->buf[0] = av_buffer_pool_get(ctx->pool);
>> +        if (frame->buf[0]) {
>> +            desc = (AVD3D11FrameDescriptor *)frame->buf[0]->data;
>> +
>> +            frame->data[0] = (uint8_t *)desc->texture;
>> +            frame->data[1] = (uint8_t *)desc->index;
>> +            frame->format  = AV_PIX_FMT_D3D11;
>> +            frame->width   = ctx->width;
>> +            frame->height  = ctx->height;
>> +
>> +            return 0;
>> +        }
>>   -    frame->buf[0] = av_buffer_pool_get(ctx->pool);
>> -    if (!frame->buf[0])
>> -        return AVERROR(ENOMEM);
>> -
>> -    desc = (AVD3D11FrameDescriptor *)frame->buf[0]->data;
>> -
>> -    frame->data[0] = (uint8_t *)desc->texture;
>> -    frame->data[1] = (uint8_t *)desc->index;
>> -    frame->format  = AV_PIX_FMT_D3D11;
>> -    frame->width   = ctx->width;
>> -    frame->height  = ctx->height;
>> +        av_usleep(1000);
>> +        s->retries++;
>> +    }
>>   -    return 0;
>> +    return AVERROR(ENOMEM);
>>   }
>>     static int d3d11va_transfer_get_formats(AVHWFramesContext *ctx,
>>
>>



