[FFmpeg-devel] [PATCH] dxva2: add AV1 decode support

James Almer jamrial at gmail.com
Thu Nov 5 18:45:33 EET 2020


On 11/5/2020 12:53 PM, Hendrik Leppkes wrote:
> ---
>   Changelog              |   1 +
>   configure              |   7 +
>   libavcodec/Makefile    |   2 +
>   libavcodec/av1dec.c    |  25 +-
>   libavcodec/dxva2.c     |  10 +-
>   libavcodec/dxva2_av1.c | 504 +++++++++++++++++++++++++++++++++++++++++
>   libavcodec/hwaccels.h  |   3 +
>   libavcodec/version.h   |   2 +-
>   8 files changed, 550 insertions(+), 4 deletions(-)
>   create mode 100644 libavcodec/dxva2_av1.c
> 
> diff --git a/Changelog b/Changelog
> index 3fdcafc355..886e69a1cc 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -40,6 +40,7 @@ version <next>:
>   - High Voltage Software ADPCM encoder
>   - LEGO Racers ALP (.tun & .pcm) muxer
>   - AV1 VAAPI decoder
> +- DXVA2/D3D11VA hardware accelerated AV1 decoding
>   
>   
>   version 4.3:
> diff --git a/configure b/configure
> index 8a9e9b3cd7..e55e910477 100755
> --- a/configure
> +++ b/configure
> @@ -2918,6 +2918,12 @@ videotoolbox_hwaccel_deps="videotoolbox pthreads"
>   videotoolbox_hwaccel_extralibs="-framework QuartzCore"
>   xvmc_deps="X11_extensions_XvMClib_h"
>   
> +av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
> +av1_d3d11va_hwaccel_select="av1_decoder"
> +av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
> +av1_d3d11va2_hwaccel_select="av1_decoder"
> +av1_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_AV1"
> +av1_dxva2_hwaccel_select="av1_decoder"
>   av1_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferAV1_bit_depth_idx"
>   av1_vaapi_hwaccel_select="av1_decoder"
>   h263_vaapi_hwaccel_deps="vaapi"
> @@ -6203,6 +6209,7 @@ enabled videotoolbox && {
>   
>   check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
>   
> +check_type "windows.h dxva.h" "DXVA_PicParams_AV1" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
>   check_type "windows.h dxva.h" "DXVA_PicParams_HEVC" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
>   check_type "windows.h dxva.h" "DXVA_PicParams_VP9" -DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP -D_CRT_BUILD_DESKTOP_APP=0
>   check_type "windows.h d3d11.h" "ID3D11VideoDecoder"
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 9d75dd68af..505960df0a 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -912,6 +912,8 @@ OBJS-$(CONFIG_VAAPI)                      += vaapi_decode.o
>   OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
>   OBJS-$(CONFIG_VDPAU)                      += vdpau.o
>   
> +OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL)        += dxva2_av1.o
> +OBJS-$(CONFIG_AV1_DXVA2_HWACCEL)          += dxva2_av1.o
>   OBJS-$(CONFIG_AV1_VAAPI_HWACCEL)          += vaapi_av1.o
>   OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
>   OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
> diff --git a/libavcodec/av1dec.c b/libavcodec/av1dec.c
> index 56712279aa..01cf92fab5 100644
> --- a/libavcodec/av1dec.c
> +++ b/libavcodec/av1dec.c
> @@ -215,7 +215,7 @@ static int get_pixel_format(AVCodecContext *avctx)
>       uint8_t bit_depth;
>       int ret;
>       enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
> -#define HWACCEL_MAX (CONFIG_AV1_VAAPI_HWACCEL)
> +#define HWACCEL_MAX (CONFIG_AV1_DXVA2_HWACCEL + CONFIG_AV1_D3D11VA_HWACCEL * 2 + CONFIG_AV1_VAAPI_HWACCEL)
>       enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
>   
>       if (seq->seq_profile == 2 && seq->color_config.high_bitdepth)
> @@ -278,11 +278,25 @@ static int get_pixel_format(AVCodecContext *avctx)
>   
>       switch (s->pix_fmt) {
>       case AV_PIX_FMT_YUV420P:
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
> +#endif
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
> +        *fmtp++ = AV_PIX_FMT_D3D11;
> +#endif
>   #if CONFIG_AV1_VAAPI_HWACCEL
>           *fmtp++ = AV_PIX_FMT_VAAPI;
>   #endif
>           break;
>       case AV_PIX_FMT_YUV420P10:
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
> +#endif
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +        *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
> +        *fmtp++ = AV_PIX_FMT_D3D11;
> +#endif
>   #if CONFIG_AV1_VAAPI_HWACCEL
>           *fmtp++ = AV_PIX_FMT_VAAPI;
>   #endif
> @@ -853,6 +867,15 @@ AVCodec ff_av1_decoder = {
>       .flush                 = av1_decode_flush,
>       .profiles              = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
>       .hw_configs            = (const AVCodecHWConfigInternal * []) {
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +        HWACCEL_DXVA2(av1),
> +#endif
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +        HWACCEL_D3D11VA(av1),
> +#endif
> +#if CONFIG_AV1_D3D11VA2_HWACCEL
> +        HWACCEL_D3D11VA2(av1),
> +#endif
>   #if CONFIG_AV1_VAAPI_HWACCEL
>           HWACCEL_VAAPI(av1),
>   #endif
> diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
> index 32416112bf..b57ea21941 100644
> --- a/libavcodec/dxva2.c
> +++ b/libavcodec/dxva2.c
> @@ -45,6 +45,7 @@ DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main,  0x5b11d51b, 0x2f4c,0x4452,0xbc,0xc3,0x0
>   DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main10,0x107af0e0, 0xef1a,0x4d19,0xab,0xa8,0x67,0xa1,0x63,0x07,0x3d,0x13);
>   DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_Profile0,0x463707f8,0xa1d0,0x4585,0x87,0x6d,0x83,0xaa,0x6d,0x60,0xb8,0x9e);
>   DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_10bit_Profile2,0xa4c749ef,0x6ecf,0x48aa,0x84,0x48,0x50,0xa7,0xa1,0x16,0x5f,0xf7);
> +DEFINE_GUID(ff_DXVA2_ModeAV1_VLD_Profile0,0xb8be4ccb,0xcf53,0x46ba,0x8d,0x59,0xd6,0xb8,0xa6,0xda,0x5d,0x2a);
>   DEFINE_GUID(ff_DXVA2_NoEncrypt,          0x1b81beD0, 0xa0c7,0x11d3,0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5);
>   DEFINE_GUID(ff_GUID_NULL,                0x00000000, 0x0000,0x0000,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
>   DEFINE_GUID(ff_IID_IDirectXVideoDecoderService, 0xfc51a551,0xd5e7,0x11d9,0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02);
> @@ -72,6 +73,8 @@ static const int prof_vp9_profile0[] = {FF_PROFILE_VP9_0,
>                                           FF_PROFILE_UNKNOWN};
>   static const int prof_vp9_profile2[] = {FF_PROFILE_VP9_2,
>                                           FF_PROFILE_UNKNOWN};
> +static const int prof_av1_profile0[] = {FF_PROFILE_AV1_MAIN,
> +                                        FF_PROFILE_UNKNOWN};
>   
>   static const dxva_mode dxva_modes[] = {
>       /* MPEG-2 */
> @@ -98,6 +101,9 @@ static const dxva_mode dxva_modes[] = {
>       { &ff_DXVA2_ModeVP9_VLD_Profile0,       AV_CODEC_ID_VP9, prof_vp9_profile0 },
>       { &ff_DXVA2_ModeVP9_VLD_10bit_Profile2, AV_CODEC_ID_VP9, prof_vp9_profile2 },
>   
> +    /* AV1 */
> +    { &ff_DXVA2_ModeAV1_VLD_Profile0,       AV_CODEC_ID_AV1, prof_av1_profile0 },
> +
>       { NULL,                          0 },
>   };
>   
> @@ -604,7 +610,7 @@ int ff_dxva2_common_frame_params(AVCodecContext *avctx,
>           surface_alignment = 32;
>       /* the HEVC DXVA2 spec asks for 128 pixel aligned surfaces to ensure
>       all coding features have enough room to work with */
> -    else if (avctx->codec_id == AV_CODEC_ID_HEVC)
> +    else if (avctx->codec_id == AV_CODEC_ID_HEVC || avctx->codec_id == AV_CODEC_ID_AV1)
>           surface_alignment = 128;
>       else
>           surface_alignment = 16;
> @@ -615,7 +621,7 @@ int ff_dxva2_common_frame_params(AVCodecContext *avctx,
>       /* add surfaces based on number of possible refs */
>       if (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_HEVC)
>           num_surfaces += 16;
> -    else if (avctx->codec_id == AV_CODEC_ID_VP9)
> +    else if (avctx->codec_id == AV_CODEC_ID_VP9 || avctx->codec_id == AV_CODEC_ID_AV1)
>           num_surfaces += 8;
>       else
>           num_surfaces += 2;
> diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c
> new file mode 100644
> index 0000000000..d04c96becf
> --- /dev/null
> +++ b/libavcodec/dxva2_av1.c
> @@ -0,0 +1,504 @@
> +/*
> + * DXVA2 AV1 HW acceleration.
> + *
> + * copyright (c) 2020 Hendrik Leppkes
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "dxva2_internal.h"
> +#include "av1dec.h"
> +
> +#define MAX_TILES 256
> +
> +struct AV1DXVAContext {
> +    FFDXVASharedContext shared;
> +
> +    unsigned int bitstream_allocated;
> +    uint8_t *bitstream_cache;
> +};
> +
> +struct av1_dxva2_picture_context {
> +    DXVA_PicParams_AV1    pp;
> +    unsigned              tile_count;
> +    DXVA_Tile_AV1         tiles[MAX_TILES];
> +    uint8_t              *bitstream;
> +    unsigned              bitstream_size;
> +};
> +
> +static int get_bit_depth_from_seq(const AV1RawSequenceHeader *seq)
> +{
> +    if (seq->seq_profile == 2 && seq->color_config.high_bitdepth)
> +        return seq->color_config.twelve_bit ? 12 : 10;
> +    else if (seq->seq_profile <= 2 && seq->color_config.high_bitdepth)
> +        return 10;
> +    else
> +        return 8;
> +}
> +
> +static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *ctx, const AV1DecContext *h,
> +                                    DXVA_PicParams_AV1 *pp)
> +{
> +    int i,j, uses_lr;
> +    const AV1RawSequenceHeader *seq = h->raw_seq;
> +    const AV1RawFrameHeader *frame_header = h->raw_frame_header;
> +
> +    unsigned char remap_lr_type[4] = { AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ };
> +
> +    memset(pp, 0, sizeof(*pp));
> +
> +    pp->width  = avctx->width;
> +    pp->height = avctx->height;
> +
> +    pp->max_width  = seq->max_frame_width_minus_1 + 1;
> +    pp->max_height = seq->max_frame_height_minus_1 + 1;
> +
> +    pp->CurrPicTextureIndex = ff_dxva2_get_surface_index(avctx, ctx, h->cur_frame.tf.f);
> +    pp->superres_denom      = frame_header->use_superres ? frame_header->coded_denom : AV1_SUPERRES_NUM;
> +    pp->bitdepth            = get_bit_depth_from_seq(seq);
> +    pp->seq_profile         = seq->seq_profile;
> +
> +    /* Tiling info */
> +    pp->tiles.cols = frame_header->tile_cols;
> +    pp->tiles.rows = frame_header->tile_rows;
> +    pp->tiles.context_update_id = frame_header->context_update_tile_id;
> +
> +    for (i = 0; i < pp->tiles.cols; i++)
> +        pp->tiles.widths[i] = frame_header->width_in_sbs_minus_1[i] + 1;
> +
> +    for (i = 0; i < pp->tiles.rows; i++)
> +        pp->tiles.heights[i] = frame_header->height_in_sbs_minus_1[i] + 1;
> +
> +    /* Coding tools */
> +    pp->coding.use_128x128_superblock       = seq->use_128x128_superblock;
> +    pp->coding.intra_edge_filter            = seq->enable_intra_edge_filter;
> +    pp->coding.interintra_compound          = seq->enable_interintra_compound;
> +    pp->coding.masked_compound              = seq->enable_masked_compound;
> +    pp->coding.warped_motion                = frame_header->allow_warped_motion;
> +    pp->coding.dual_filter                  = seq->enable_dual_filter;
> +    pp->coding.jnt_comp                     = seq->enable_jnt_comp;
> +    pp->coding.screen_content_tools         = frame_header->allow_screen_content_tools;
> +    pp->coding.integer_mv                   = frame_header->force_integer_mv || !(frame_header->frame_type & 1);
> +    pp->coding.cdef                         = seq->enable_cdef;
> +    pp->coding.restoration                  = seq->enable_restoration;
> +    pp->coding.film_grain                   = seq->film_grain_params_present;
> +    pp->coding.intrabc                      = frame_header->allow_intrabc;
> +    pp->coding.high_precision_mv            = frame_header->allow_high_precision_mv;
> +    pp->coding.switchable_motion_mode       = frame_header->is_motion_mode_switchable;
> +    pp->coding.filter_intra                 = seq->enable_filter_intra;
> +    pp->coding.disable_frame_end_update_cdf = frame_header->disable_frame_end_update_cdf;
> +    pp->coding.disable_cdf_update           = frame_header->disable_cdf_update;
> +    pp->coding.reference_mode               = frame_header->reference_select;
> +    pp->coding.skip_mode                    = frame_header->skip_mode_present;
> +    pp->coding.reduced_tx_set               = frame_header->reduced_tx_set;
> +    pp->coding.superres                     = frame_header->use_superres;
> +    pp->coding.tx_mode                      = frame_header->tx_mode;
> +    pp->coding.use_ref_frame_mvs            = frame_header->use_ref_frame_mvs;
> +    pp->coding.enable_ref_frame_mvs         = seq->enable_ref_frame_mvs;
> +    pp->coding.reference_frame_update       = !(frame_header->show_existing_frame == 1 && frame_header->frame_type == AV1_FRAME_KEY);

hwaccel->start_frame() is not called for frames with
frame_header->show_existing_frame == 1 (those are essentially
just a header telling the decoder to output a previously decoded frame,
and possibly update the reference frame state), so that check is
superfluous — and by extension so is the whole expression. Just hardcode it to 1.

Is this field documented anywhere?

> +
> +    /* Format & Picture Info flags */
> +    pp->format.frame_type     = frame_header->frame_type;
> +    pp->format.show_frame     = frame_header->show_frame;
> +    pp->format.showable_frame = frame_header->showable_frame;
> +    pp->format.subsampling_x  = seq->color_config.subsampling_x;
> +    pp->format.subsampling_y  = seq->color_config.subsampling_y;
> +    pp->format.mono_chrome    = seq->color_config.mono_chrome;
> +
> +    /* References */
> +    pp->primary_ref_frame = frame_header->primary_ref_frame;
> +    pp->order_hint        = frame_header->order_hint;
> +    pp->order_hint_bits   = seq->enable_order_hint ? seq->order_hint_bits_minus_1 + 1 : 0;
> +
> +    memset(pp->RefFrameMapTextureIndex, 0xFF, sizeof(pp->RefFrameMapTextureIndex));
> +    for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
> +        int8_t ref_idx = frame_header->ref_frame_idx[i];
> +        AVFrame *ref_frame = h->ref[ref_idx].tf.f;
> +
> +        pp->frame_refs[i].width  = ref_frame->width;
> +        pp->frame_refs[i].height = ref_frame->height;
> +        pp->frame_refs[i].Index  = ref_frame->buf[0] ? ref_idx : 0xFF;
> +
> +        /* Global Motion */
> +        pp->frame_refs[i].wminvalid = (h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i] == AV1_WARP_MODEL_IDENTITY);
> +        pp->frame_refs[i].wmtype    = h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i];
> +        for (j = 0; j < 6; ++j) {
> +             pp->frame_refs[i].wmmat[j] = h->cur_frame.gm_params[AV1_REF_FRAME_LAST + i][j];
> +        }
> +    }
> +    for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
> +        AVFrame *ref_frame = h->ref[i].tf.f;
> +        if (ref_frame->buf[0])
> +            pp->RefFrameMapTextureIndex[i] = ff_dxva2_get_surface_index(avctx, ctx, ref_frame);
> +    }
> +
> +    /* Loop filter parameters */
> +    pp->loop_filter.filter_level[0]        = frame_header->loop_filter_level[0];
> +    pp->loop_filter.filter_level[1]        = frame_header->loop_filter_level[1];
> +    pp->loop_filter.filter_level_u         = frame_header->loop_filter_level[2];
> +    pp->loop_filter.filter_level_v         = frame_header->loop_filter_level[3];
> +    pp->loop_filter.sharpness_level        = frame_header->loop_filter_sharpness;
> +    pp->loop_filter.mode_ref_delta_enabled = frame_header->loop_filter_delta_enabled;
> +    pp->loop_filter.mode_ref_delta_update  = frame_header->loop_filter_delta_update;
> +    pp->loop_filter.delta_lf_multi         = frame_header->delta_lf_multi;
> +    pp->loop_filter.delta_lf_present       = frame_header->delta_lf_present;
> +    pp->loop_filter.delta_lf_res           = frame_header->delta_lf_res;
> +
> +    for (i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) {
> +        pp->loop_filter.ref_deltas[i] = frame_header->loop_filter_ref_deltas[i];
> +    }
> +
> +    pp->loop_filter.mode_deltas[0]                = frame_header->loop_filter_mode_deltas[0];
> +    pp->loop_filter.mode_deltas[1]                = frame_header->loop_filter_mode_deltas[1];
> +    pp->loop_filter.frame_restoration_type[0]     = remap_lr_type[frame_header->lr_type[0]];
> +    pp->loop_filter.frame_restoration_type[1]     = remap_lr_type[frame_header->lr_type[1]];
> +    pp->loop_filter.frame_restoration_type[2]     = remap_lr_type[frame_header->lr_type[2]];
> +    uses_lr = frame_header->lr_type[0] || frame_header->lr_type[1] || frame_header->lr_type[2];
> +    pp->loop_filter.log2_restoration_unit_size[0] = uses_lr ? (6 + frame_header->lr_unit_shift) : 8;
> +    pp->loop_filter.log2_restoration_unit_size[1] = uses_lr ? (6 + frame_header->lr_unit_shift - frame_header->lr_uv_shift) : 8;
> +    pp->loop_filter.log2_restoration_unit_size[2] = uses_lr ? (6 + frame_header->lr_unit_shift - frame_header->lr_uv_shift) : 8;
> +
> +    /* Quantization */
> +    pp->quantization.delta_q_present = frame_header->delta_q_present;
> +    pp->quantization.delta_q_res     = frame_header->delta_q_res;
> +    pp->quantization.base_qindex     = frame_header->base_q_idx;
> +    pp->quantization.y_dc_delta_q    = frame_header->delta_q_y_dc;
> +    pp->quantization.u_dc_delta_q    = frame_header->delta_q_u_dc;
> +    pp->quantization.v_dc_delta_q    = frame_header->delta_q_v_dc;
> +    pp->quantization.u_ac_delta_q    = frame_header->delta_q_u_ac;
> +    pp->quantization.v_ac_delta_q    = frame_header->delta_q_v_ac;
> +    pp->quantization.qm_y            = frame_header->using_qmatrix ? frame_header->qm_y : 0xFF;
> +    pp->quantization.qm_u            = frame_header->using_qmatrix ? frame_header->qm_u : 0xFF;
> +    pp->quantization.qm_v            = frame_header->using_qmatrix ? frame_header->qm_v : 0xFF;
> +
> +    /* Cdef parameters */
> +    pp->cdef.damping = frame_header->cdef_damping_minus_3;
> +    pp->cdef.bits    = frame_header->cdef_bits;
> +    for (i = 0; i < 8; i++) {
> +        pp->cdef.y_strengths[i].primary    = frame_header->cdef_y_pri_strength[i];
> +        pp->cdef.y_strengths[i].secondary  = frame_header->cdef_y_sec_strength[i];
> +        pp->cdef.uv_strengths[i].primary   = frame_header->cdef_uv_pri_strength[i];
> +        pp->cdef.uv_strengths[i].secondary = frame_header->cdef_uv_sec_strength[i];
> +    }
> +
> +    /* Misc flags */
> +    pp->interp_filter = frame_header->interpolation_filter;
> +
> +    /* Segmentation */
> +    pp->segmentation.enabled         = frame_header->segmentation_enabled;
> +    pp->segmentation.update_map      = frame_header->segmentation_update_map;
> +    pp->segmentation.update_data     = frame_header->segmentation_update_data;
> +    pp->segmentation.temporal_update = frame_header->segmentation_temporal_update;
> +    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
> +        for (j = 0; j < AV1_SEG_LVL_MAX; j++) {
> +            pp->segmentation.feature_mask[i].mask |= frame_header->feature_enabled[i][j] << j;
> +            pp->segmentation.feature_data[i][j]    = frame_header->feature_value[i][j];
> +        }
> +    }
> +
> +    /* Film grain */
> +    if (frame_header->apply_grain) {
> +        pp->film_grain.apply_grain              = 1;
> +        pp->film_grain.scaling_shift_minus8     = frame_header->grain_scaling_minus_8;
> +        pp->film_grain.chroma_scaling_from_luma = frame_header->chroma_scaling_from_luma;
> +        pp->film_grain.ar_coeff_lag             = frame_header->ar_coeff_lag;
> +        pp->film_grain.ar_coeff_shift_minus6    = frame_header->ar_coeff_shift_minus_6;
> +        pp->film_grain.grain_scale_shift        = frame_header->grain_scale_shift;
> +        pp->film_grain.overlap_flag             = frame_header->overlap_flag;
> +        pp->film_grain.clip_to_restricted_range = frame_header->clip_to_restricted_range;
> +        pp->film_grain.matrix_coeff_is_identity = (seq->color_config.matrix_coefficients == AVCOL_SPC_RGB);
> +
> +        pp->film_grain.grain_seed               = frame_header->grain_seed;
> +        pp->film_grain.num_y_points             = frame_header->num_y_points;
> +        for (i = 0; i < frame_header->num_y_points; i++) {
> +            pp->film_grain.scaling_points_y[i][0] = frame_header->point_y_value[i];
> +            pp->film_grain.scaling_points_y[i][1] = frame_header->point_y_scaling[i];
> +        }
> +        pp->film_grain.num_cb_points            = frame_header->num_cb_points;
> +        for (i = 0; i < frame_header->num_cb_points; i++) {
> +            pp->film_grain.scaling_points_cb[i][0] = frame_header->point_cb_value[i];
> +            pp->film_grain.scaling_points_cb[i][1] = frame_header->point_cb_scaling[i];
> +        }
> +        pp->film_grain.num_cr_points            = frame_header->num_cr_points;
> +        for (i = 0; i < frame_header->num_cr_points; i++) {
> +            pp->film_grain.scaling_points_cr[i][0] = frame_header->point_cr_value[i];
> +            pp->film_grain.scaling_points_cr[i][1] = frame_header->point_cr_scaling[i];
> +        }
> +        for (i = 0; i < 24; i++) {
> +            pp->film_grain.ar_coeffs_y[i] = frame_header->ar_coeffs_y_plus_128[i];
> +        }
> +        for (i = 0; i < 25; i++) {
> +            pp->film_grain.ar_coeffs_cb[i] = frame_header->ar_coeffs_cb_plus_128[i];
> +            pp->film_grain.ar_coeffs_cr[i] = frame_header->ar_coeffs_cr_plus_128[i];
> +        }
> +        pp->film_grain.cb_mult      = frame_header->cb_mult;
> +        pp->film_grain.cb_luma_mult = frame_header->cb_luma_mult;
> +        pp->film_grain.cr_mult      = frame_header->cr_mult;
> +        pp->film_grain.cr_luma_mult = frame_header->cr_luma_mult;
> +        pp->film_grain.cb_offset    = frame_header->cb_offset;
> +        pp->film_grain.cr_offset    = frame_header->cr_offset;
> +        pp->film_grain.cr_offset    = frame_header->cr_offset;
> +    }
> +
> +    // XXX: setting the StatusReportFeedbackNumber breaks decoding on some drivers
> +    // we never use the status reporting functionality, so just skip on that
> +    //pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++;
> +    return 0;
> +}
> +
> +static int dxva2_av1_start_frame(AVCodecContext *avctx,
> +                                 av_unused const uint8_t *buffer,
> +                                 av_unused uint32_t size)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +
> +    if (!DXVA_CONTEXT_VALID(avctx, ctx))
> +        return -1;
> +    av_assert0(ctx_pic);
> +
> +    /* Fill up DXVA_PicParams_AV1 */
> +    if (fill_picture_parameters(avctx, ctx, h, &ctx_pic->pp) < 0)
> +        return -1;
> +
> +    ctx_pic->bitstream_size = 0;
> +    ctx_pic->bitstream      = NULL;
> +    return 0;
> +}
> +
> +static int dxva2_av1_decode_slice(AVCodecContext *avctx,
> +                                  const uint8_t *buffer,
> +                                  uint32_t size)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    const AV1RawFrameHeader *frame_header = h->raw_frame_header;
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +    struct AV1DXVAContext *ctx = avctx->internal->hwaccel_priv_data;
> +    void *tmp;
> +
> +    ctx_pic->tile_count = frame_header->tile_cols * frame_header->tile_rows;
> +
> +    /* too many tiles, exceeding all defined levels in the AV1 spec */
> +    if (ctx_pic->tile_count > MAX_TILES)
> +        return AVERROR(ENOSYS);
> +
> +    /* Shortcut if all tiles are in the same buffer */
> +    if (ctx_pic->tile_count == h->tg_end - h->tg_start + 1) {
> +        ctx_pic->bitstream = (uint8_t *)buffer;
> +        ctx_pic->bitstream_size = size;
> +
> +        for (uint32_t tile_num = 0; tile_num < ctx_pic->tile_count; tile_num++) {
> +            ctx_pic->tiles[tile_num].DataOffset   = h->tile_group_info[tile_num].tile_offset;
> +            ctx_pic->tiles[tile_num].DataSize     = h->tile_group_info[tile_num].tile_size;
> +            ctx_pic->tiles[tile_num].row          = h->tile_group_info[tile_num].tile_row;
> +            ctx_pic->tiles[tile_num].column       = h->tile_group_info[tile_num].tile_column;
> +            ctx_pic->tiles[tile_num].anchor_frame = 0xFF;
> +        }
> +
> +        return 0;
> +    }
> +
> +    /* allocate an internal buffer */
> +    tmp = av_fast_realloc(ctx->bitstream_cache, &ctx->bitstream_allocated,
> +                          ctx_pic->bitstream_size + size);
> +    if (!tmp) {
> +        return AVERROR(ENOMEM);
> +    }
> +    ctx_pic->bitstream = ctx->bitstream_cache = tmp;
> +
> +    memcpy(ctx_pic->bitstream + ctx_pic->bitstream_size, buffer, size);
> +
> +    for (uint32_t tile_num = h->tg_start; tile_num <= h->tg_end; tile_num++) {
> +        ctx_pic->tiles[tile_num].DataOffset   = ctx_pic->bitstream_size + h->tile_group_info[tile_num].tile_offset;
> +        ctx_pic->tiles[tile_num].DataSize     = h->tile_group_info[tile_num].tile_size;
> +        ctx_pic->tiles[tile_num].row          = h->tile_group_info[tile_num].tile_row;
> +        ctx_pic->tiles[tile_num].column       = h->tile_group_info[tile_num].tile_column;
> +        ctx_pic->tiles[tile_num].anchor_frame = 0xFF;
> +    }
> +
> +    ctx_pic->bitstream_size += size;
> +
> +    return 0;
> +}
> +
> +static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
> +                                             DECODER_BUFFER_DESC *bs,
> +                                             DECODER_BUFFER_DESC *sc)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +    void     *dxva_data_ptr;
> +    uint8_t  *dxva_data;
> +    unsigned dxva_size;
> +    unsigned padding;
> +    unsigned type;
> +
> +#if CONFIG_D3D11VA
> +    if (ff_dxva2_is_d3d11(avctx)) {
> +        type = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
> +        if (FAILED(ID3D11VideoContext_GetDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context,
> +                                                       D3D11VA_CONTEXT(ctx)->decoder,
> +                                                       type,
> +                                                       &dxva_size, &dxva_data_ptr)))
> +            return -1;
> +    }
> +#endif
> +#if CONFIG_DXVA2
> +    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
> +        type = DXVA2_BitStreamDateBufferType;
> +        if (FAILED(IDirectXVideoDecoder_GetBuffer(DXVA2_CONTEXT(ctx)->decoder,
> +                                                  type,
> +                                                  &dxva_data_ptr, &dxva_size)))
> +            return -1;
> +    }
> +#endif
> +
> +    dxva_data = dxva_data_ptr;
> +
> +    if (ctx_pic->bitstream_size > dxva_size) {
> +        av_log(avctx, AV_LOG_ERROR, "Bitstream size exceeds hardware buffer");
> +        return -1;
> +    }
> +
> +    memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->bitstream_size);

This is a memcpy after a (potential) memcpy in dxva2_av1_decode_slice(). 
Is there no way to avoid it?

> +
> +    padding = FFMIN(128 - ((ctx_pic->bitstream_size) & 127), dxva_size - ctx_pic->bitstream_size);
> +    if (padding > 0) {
> +        memset(dxva_data + ctx_pic->bitstream_size, 0, padding);
> +        ctx_pic->bitstream_size += padding;
> +    }
> +
> +#if CONFIG_D3D11VA
> +    if (ff_dxva2_is_d3d11(avctx))
> +        if (FAILED(ID3D11VideoContext_ReleaseDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context, D3D11VA_CONTEXT(ctx)->decoder, type)))
> +            return -1;
> +#endif
> +#if CONFIG_DXVA2
> +    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD)
> +        if (FAILED(IDirectXVideoDecoder_ReleaseBuffer(DXVA2_CONTEXT(ctx)->decoder, type)))
> +            return -1;
> +#endif
> +
> +#if CONFIG_D3D11VA
> +    if (ff_dxva2_is_d3d11(avctx)) {
> +        D3D11_VIDEO_DECODER_BUFFER_DESC *dsc11 = bs;
> +        memset(dsc11, 0, sizeof(*dsc11));
> +        dsc11->BufferType           = type;
> +        dsc11->DataSize             = ctx_pic->bitstream_size;
> +        dsc11->NumMBsInBuffer       = 0;
> +
> +        type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
> +    }
> +#endif
> +#if CONFIG_DXVA2
> +    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
> +        DXVA2_DecodeBufferDesc *dsc2 = bs;
> +        memset(dsc2, 0, sizeof(*dsc2));
> +        dsc2->CompressedBufferType = type;
> +        dsc2->DataSize             = ctx_pic->bitstream_size;
> +        dsc2->NumMBsInBuffer       = 0;
> +
> +        type = DXVA2_SliceControlBufferType;
> +    }
> +#endif
> +
> +    return ff_dxva2_commit_buffer(avctx, ctx, sc, type,
> +                                  ctx_pic->tiles, sizeof(*ctx_pic->tiles) * ctx_pic->tile_count, 0);
> +}
> +
> +static int dxva2_av1_end_frame(AVCodecContext *avctx)
> +{
> +    const AV1DecContext *h = avctx->priv_data;
> +    struct av1_dxva2_picture_context *ctx_pic = h->cur_frame.hwaccel_picture_private;
> +    int ret;
> +
> +    if (ctx_pic->bitstream_size <= 0)
> +        return -1;
> +
> +    ret = ff_dxva2_common_end_frame(avctx, h->cur_frame.tf.f,
> +                                    &ctx_pic->pp, sizeof(ctx_pic->pp),
> +                                    NULL, 0,
> +                                    commit_bitstream_and_slice_buffer);
> +
> +    return ret;
> +}
> +
> +static int dxva2_av1_uninit(AVCodecContext *avctx)
> +{
> +    struct AV1DXVAContext *ctx = avctx->internal->hwaccel_priv_data;
> +
> +    av_freep(&ctx->bitstream_cache);
> +    ctx->bitstream_allocated = 0;
> +
> +    return ff_dxva2_decode_uninit(avctx);
> +}
> +
> +#if CONFIG_AV1_DXVA2_HWACCEL
> +const AVHWAccel ff_av1_dxva2_hwaccel = {
> +    .name           = "av1_dxva2",
> +    .type           = AVMEDIA_TYPE_VIDEO,
> +    .id             = AV_CODEC_ID_AV1,
> +    .pix_fmt        = AV_PIX_FMT_DXVA2_VLD,
> +    .init           = ff_dxva2_decode_init,
> +    .uninit         = dxva2_av1_uninit,
> +    .start_frame    = dxva2_av1_start_frame,
> +    .decode_slice   = dxva2_av1_decode_slice,
> +    .end_frame      = dxva2_av1_end_frame,
> +    .frame_params   = ff_dxva2_common_frame_params,
> +    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
> +    .priv_data_size = sizeof(struct AV1DXVAContext),
> +};
> +#endif
> +
> +#if CONFIG_AV1_D3D11VA_HWACCEL
> +const AVHWAccel ff_av1_d3d11va_hwaccel = {
> +    .name           = "av1_d3d11va",
> +    .type           = AVMEDIA_TYPE_VIDEO,
> +    .id             = AV_CODEC_ID_AV1,
> +    .pix_fmt        = AV_PIX_FMT_D3D11VA_VLD,
> +    .init           = ff_dxva2_decode_init,
> +    .uninit         = dxva2_av1_uninit,
> +    .start_frame    = dxva2_av1_start_frame,
> +    .decode_slice   = dxva2_av1_decode_slice,
> +    .end_frame      = dxva2_av1_end_frame,
> +    .frame_params   = ff_dxva2_common_frame_params,
> +    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
> +    .priv_data_size = sizeof(struct AV1DXVAContext),
> +};
> +#endif
> +
> +#if CONFIG_AV1_D3D11VA2_HWACCEL
> +const AVHWAccel ff_av1_d3d11va2_hwaccel = {
> +    .name           = "av1_d3d11va2",
> +    .type           = AVMEDIA_TYPE_VIDEO,
> +    .id             = AV_CODEC_ID_AV1,
> +    .pix_fmt        = AV_PIX_FMT_D3D11,
> +    .init           = ff_dxva2_decode_init,
> +    .uninit         = dxva2_av1_uninit,
> +    .start_frame    = dxva2_av1_start_frame,
> +    .decode_slice   = dxva2_av1_decode_slice,
> +    .end_frame      = dxva2_av1_end_frame,
> +    .frame_params   = ff_dxva2_common_frame_params,
> +    .frame_priv_data_size = sizeof(struct av1_dxva2_picture_context),
> +    .priv_data_size = sizeof(struct AV1DXVAContext),
> +};
> +#endif
> diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
> index 18e9079c55..9869ce9f72 100644
> --- a/libavcodec/hwaccels.h
> +++ b/libavcodec/hwaccels.h
> @@ -21,6 +21,9 @@
>   
>   #include "avcodec.h"
>   
> +extern const AVHWAccel ff_av1_d3d11va_hwaccel;
> +extern const AVHWAccel ff_av1_d3d11va2_hwaccel;
> +extern const AVHWAccel ff_av1_dxva2_hwaccel;
>   extern const AVHWAccel ff_av1_vaapi_hwaccel;
>   extern const AVHWAccel ff_h263_vaapi_hwaccel;
>   extern const AVHWAccel ff_h263_videotoolbox_hwaccel;
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 5173d0f090..a595e32832 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -29,7 +29,7 @@
>   
>   #define LIBAVCODEC_VERSION_MAJOR  58
>   #define LIBAVCODEC_VERSION_MINOR 112
> -#define LIBAVCODEC_VERSION_MICRO 101
> +#define LIBAVCODEC_VERSION_MICRO 102
>   
>   #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
>                                                  LIBAVCODEC_VERSION_MINOR, \
> 



More information about the ffmpeg-devel mailing list