[FFmpeg-devel] [PATCH] lavc/vaapi: Add VP8 decode hwaccel

Fri Nov 11 09:57:13 EET 2016

From 4635e7e4a0ea24f77e71ffc9a9074e75c61bfe44 Mon Sep 17 00:00:00 2001
From: Jun Zhao <jun.zhao at intel.com>
Date: Fri, 11 Nov 2016 15:51:01 +0800
Subject: [PATCH] lavc/vaapi: Add VP8 decode hwaccel

Add a VP8 decode hwaccel, based on the following libav commits:
commit a9fb134730da1f9642eb5a2baa50943b8a4aa245
    lavc/vaapi: Add VP8 decode hwaccel
commit 75d642a944d5579e4ef20ff3701422a64692afcf
    vaapi_vp8: Explicitly include libva vp8 decode header

Reviewed-by: Jun Zhao <jun.zhao at intel.com>
Signed-off-by: Wang, Yi A <yi.a.wang at intel.com>
---
 configure                   |   3 +
 libavcodec/Makefile         |   1 +
 libavcodec/allcodecs.c      |   1 +
 libavcodec/vaapi.c          |  15 ++++-
 libavcodec/vaapi.h          |   9 +++
 libavcodec/vaapi_internal.h |   3 +
 libavcodec/vp8.c            | 149 ++++++++++++++++++++++++++++++--------------
 libavcodec/vp8.h            |  29 ++++++++-
 8 files changed, 159 insertions(+), 51 deletions(-)

diff --git a/configure b/configure
index 87b06f1..7b6a1c4 100755
--- a/configure
+++ b/configure
@@ -2668,6 +2668,8 @@ vp8_cuvid_hwaccel_deps="cuda cuvid"
 vp9_cuvid_hwaccel_deps="cuda cuvid"
 vp8_mediacodec_decoder_deps="mediacodec"
 vp8_mediacodec_hwaccel_deps="mediacodec"
+vp8_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferVP8"
+vp8_vaapi_hwaccel_select="vp8_decoder"
 vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9"
 vp9_d3d11va_hwaccel_select="vp9_decoder"
 vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9"
@@ -5588,6 +5590,7 @@ check_type "va/va.h va/va_vpp.h" "VAProcPipelineParameterBuffer"
 check_type "va/va.h va/va_enc_h264.h" "VAEncPictureParameterBufferH264"
 check_type "va/va.h va/va_enc_hevc.h" "VAEncPictureParameterBufferHEVC"
 check_type "va/va.h va/va_enc_jpeg.h" "VAEncPictureParameterBufferJPEG"
+check_type "va/va.h va/va_dec_vp8.h" "VAPictureParameterBufferVP8"
 
 check_type "vdpau/vdpau.h" "VdpPictureInfoHEVC"
 
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 5fdc97f..502872c 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -804,6 +804,7 @@ OBJS-$(CONFIG_VC1_VDPAU_HWACCEL)          += vdpau_vc1.o
 OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL)        += dxva2_vp9.o
 OBJS-$(CONFIG_VP9_DXVA2_HWACCEL)          += dxva2_vp9.o
 OBJS-$(CONFIG_VP9_VAAPI_HWACCEL)          += vaapi_vp9.o
+OBJS-$(CONFIG_VP8_VAAPI_HWACCEL)          += vaapi_vp8.o
 
 # libavformat dependencies
 OBJS-$(CONFIG_ISO_MEDIA)               += mpeg4audio.o mpegaudiodata.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index ada9481..64a7ccd 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -125,6 +125,7 @@ void avcodec_register_all(void)
     REGISTER_HWACCEL(WMV3_DXVA2,        wmv3_dxva2);
     REGISTER_HWACCEL(WMV3_VAAPI,        wmv3_vaapi);
     REGISTER_HWACCEL(WMV3_VDPAU,        wmv3_vdpau);
+    REGISTER_HWACCEL(VP8_VAAPI,         vp8_vaapi);
 
     /* video codecs */
     REGISTER_ENCODER(A64MULTI,          a64multi);
diff --git a/libavcodec/vaapi.c b/libavcodec/vaapi.c
index 36db640..e440e8f 100644
--- a/libavcodec/vaapi.c
+++ b/libavcodec/vaapi.c
@@ -59,6 +59,7 @@ int ff_vaapi_context_init(AVCodecContext *avctx)
     vactx->pic_param_buf_id     = VA_INVALID_ID;
     vactx->iq_matrix_buf_id     = VA_INVALID_ID;
     vactx->bitplane_buf_id      = VA_INVALID_ID;
+    vactx->prob_buf_id          = VA_INVALID_ID;
 
     return 0;
 }
@@ -70,7 +71,7 @@ int ff_vaapi_context_fini(AVCodecContext *avctx)
 
 int ff_vaapi_render_picture(FFVAContext *vactx, VASurfaceID surface)
 {
-    VABufferID va_buffers[3];
+    VABufferID va_buffers[4];
     unsigned int n_va_buffers = 0;
 
     if (vactx->pic_param_buf_id == VA_INVALID_ID)
@@ -89,6 +90,11 @@ int ff_vaapi_render_picture(FFVAContext *vactx, VASurfaceID surface)
         va_buffers[n_va_buffers++] = vactx->bitplane_buf_id;
     }
 
+    if (vactx->prob_buf_id != VA_INVALID_ID) {
+        vaUnmapBuffer(vactx->display, vactx->prob_buf_id);
+        va_buffers[n_va_buffers++] = vactx->prob_buf_id;
+    }
+
     if (vaBeginPicture(vactx->display, vactx->context_id,
                        surface) != VA_STATUS_SUCCESS)
         return -1;
@@ -175,6 +181,11 @@ uint8_t *ff_vaapi_alloc_bitplane(FFVAContext *vactx, uint32_t size)
     return alloc_buffer(vactx, VABitPlaneBufferType, size, &vactx->bitplane_buf_id);
 }
 
+uint8_t *ff_vaapi_alloc_prob_buffer(FFVAContext *vactx, uint32_t size)
+{
+    return alloc_buffer(vactx, VAProbabilityBufferType, size, &vactx->prob_buf_id);
+}
+
 VASliceParameterBufferBase *ff_vaapi_alloc_slice(FFVAContext *vactx, const uint8_t *buffer, uint32_t size)
 {
     uint8_t *slice_params;
@@ -209,10 +220,10 @@ VASliceParameterBufferBase *ff_vaapi_alloc_slice(FFVAContext *vactx, const uint8
 void ff_vaapi_common_end_frame(AVCodecContext *avctx)
 {
     FFVAContext * const vactx = ff_vaapi_get_context(avctx);
-
     destroy_buffers(vactx->display, &vactx->pic_param_buf_id, 1);
     destroy_buffers(vactx->display, &vactx->iq_matrix_buf_id, 1);
     destroy_buffers(vactx->display, &vactx->bitplane_buf_id, 1);
+    destroy_buffers(vactx->display, &vactx->prob_buf_id, 1);
     destroy_buffers(vactx->display, vactx->slice_buf_ids, vactx->n_slice_buf_ids);
     av_freep(&vactx->slice_buf_ids);
     av_freep(&vactx->slice_params);
diff --git a/libavcodec/vaapi.h b/libavcodec/vaapi.h
index 7a29f6f..4c86538 100644
--- a/libavcodec/vaapi.h
+++ b/libavcodec/vaapi.h
@@ -103,6 +103,15 @@ struct vaapi_context {
     uint32_t bitplane_buf_id;
 
     /**
+     * VAProbabilityBuffer ID (for VP8 decoding)
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    uint32_t prob_buf_id;
+
+    /**
      * Slice parameter/data buffer IDs
      *
      * - encoding: unused
diff --git a/libavcodec/vaapi_internal.h b/libavcodec/vaapi_internal.h
index 306ae13..76cc6b0 100644
--- a/libavcodec/vaapi_internal.h
+++ b/libavcodec/vaapi_internal.h
@@ -42,6 +42,7 @@ typedef struct {
     VABufferID pic_param_buf_id;        ///< Picture parameter buffer
     VABufferID iq_matrix_buf_id;        ///< Inverse quantiser matrix buffer
     VABufferID bitplane_buf_id;         ///< Bitplane buffer (for VC-1 decoding)
+    VABufferID prob_buf_id;             ///< Probability buffer (for VP8 decoding)
     VABufferID *slice_buf_ids;          ///< Slice parameter/data buffers
     unsigned int n_slice_buf_ids;       ///< Number of effective slice buffers
     unsigned int slice_buf_ids_alloc;   ///< Number of allocated slice buffers
@@ -83,6 +84,8 @@ void *ff_vaapi_alloc_iq_matrix(FFVAContext *vactx, unsigned int size);
 /** Allocate a new bit-plane buffer */
 uint8_t *ff_vaapi_alloc_bitplane(FFVAContext *vactx, uint32_t size);
 
+/** Allocate a new probability buffer */
+uint8_t *ff_vaapi_alloc_prob_buffer(FFVAContext *vactx, uint32_t size);
 /**
  * Allocate a new slice descriptor for the input slice.
  *
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index c1c3eb7..fe53806 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -140,7 +140,7 @@ static VP8Frame *vp8_find_free_buffer(VP8Context *s)
         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
         abort();
     }
-    if (frame->tf.f->data[0])
+    if (frame->tf.f->buf[0])
         vp8_release_frame(s, frame);
 
     return frame;
@@ -218,8 +218,9 @@ static void parse_segment_info(VP8Context *s)
     int i;
 
     s->segmentation.update_map = vp8_rac_get(c);
+    s->segmentation.update_feature_data = vp8_rac_get(c);
 
-    if (vp8_rac_get(c)) { // update segment feature data
+    if (s->segmentation.update_feature_data) { // update segment feature data
         s->segmentation.absolute_vals = vp8_rac_get(c);
 
         for (i = 0; i < 4; i++)
@@ -273,11 +274,14 @@ static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
         int size = AV_RL24(sizes + 3 * i);
         if (buf_size - size < 0)
             return -1;
+        s->coeff_partition_size[i] = size;
 
         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
         buf      += size;
         buf_size -= size;
     }
+
+    s->coeff_partition_size[i] = buf_size;
     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 
     return 0;
@@ -333,6 +337,12 @@ static void vp8_get_quants(VP8Context *s)
         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
     }
+    s->quant.yac_qi = yac_qi;
+    s->quant.ydc_delta = ydc_delta;
+    s->quant.y2dc_delta = y2dc_delta;
+    s->quant.y2ac_delta = y2ac_delta;
+    s->quant.uvdc_delta = uvdc_delta;
+    s->quant.uvac_delta = uvac_delta;
 }
 
 /**
@@ -656,6 +666,7 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     buf      += 3;
     buf_size -= 3;
 
+    s->header_partition_size = header_size;
     if (s->profile > 3)
         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 
@@ -719,9 +730,11 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     s->filter.level     = vp8_rac_get_uint(c, 6);
     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 
-    if ((s->lf_delta.enabled = vp8_rac_get(c)))
-        if (vp8_rac_get(c))
+    if ((s->lf_delta.enabled = vp8_rac_get(c))) {
+        s->lf_delta.update = vp8_rac_get(c);
+        if (s->lf_delta.update)
             update_lf_deltas(s);
+    }
 
     if (setup_partitions(s, buf, buf_size)) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
@@ -761,6 +774,12 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
     }
 
+    s->c.code_word = vp56_rac_renorm(&s->c);
+    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
+    s->coder_state_at_header_end.range     = s->c.high;
+    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
+    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
+
     return 0;
 }
 
@@ -2538,6 +2557,22 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     if (ret < 0)
         goto err;
+    if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
+        enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_VP8_VAAPI_HWACCEL
+            AV_PIX_FMT_VAAPI,
+#endif
+            AV_PIX_FMT_YUV420P,
+            AV_PIX_FMT_NONE,
+        };
+
+        s->pix_fmt = ff_get_format(s->avctx, pix_fmts);
+        if (s->pix_fmt < 0) {
+            ret = AVERROR(EINVAL);
+            goto err;
+        }
+        avctx->pix_fmt = s->pix_fmt;
+    }
 
     prev_frame = s->framep[VP56_FRAME_CURRENT];
 
@@ -2557,7 +2592,7 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // release no longer referenced frames
     for (i = 0; i < 5; i++)
-        if (s->frames[i].tf.f->data[0] &&
+        if (s->frames[i].tf.f->buf[0] &&
             &s->frames[i] != prev_frame &&
             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
@@ -2613,52 +2648,65 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (avctx->codec->update_thread_context)
         ff_thread_finish_setup(avctx);
 
-    s->linesize   = curframe->tf.f->linesize[0];
-    s->uvlinesize = curframe->tf.f->linesize[1];
-
-    memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
-    /* Zero macroblock structures for top/top-left prediction
-     * from outside the frame. */
-    if (!s->mb_layout)
-        memset(s->macroblocks + s->mb_height * 2 - 1, 0,
-               (s->mb_width + 1) * sizeof(*s->macroblocks));
-    if (!s->mb_layout && s->keyframe)
-        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
+    if (avctx->hwaccel) {
+        ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
+        if (ret < 0)
+            goto err;
+        ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
+        if (ret < 0)
+            goto err;
+        ret = avctx->hwaccel->end_frame(avctx);
+        if (ret < 0) {
+            av_log(NULL, AV_LOG_ERROR, "end_frame error");
+            goto err;
+        }
+    } else {
+        s->linesize   = curframe->tf.f->linesize[0];
+        s->uvlinesize = curframe->tf.f->linesize[1];
 
-    memset(s->ref_count, 0, sizeof(s->ref_count));
+        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
+        /* Zero macroblock structures for top/top-left prediction
+         * from outside the frame. */
+        if (!s->mb_layout)
+            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
+                (s->mb_width + 1) * sizeof(*s->macroblocks));
+        if (!s->mb_layout && s->keyframe)
+            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
+
+        memset(s->ref_count, 0, sizeof(s->ref_count));
+
+        if (s->mb_layout == 1) {
+            // Make sure the previous frame has read its segmentation map,
+            // if we re-use the same map.
+            if (prev_frame && s->segmentation.enabled &&
+                !s->segmentation.update_map)
+                ff_thread_await_progress(&prev_frame->tf, 1, 0);
+            if (is_vp7)
+                vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+            else
+                vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+        }
 
-    if (s->mb_layout == 1) {
-        // Make sure the previous frame has read its segmentation map,
-        // if we re-use the same map.
-        if (prev_frame && s->segmentation.enabled &&
-            !s->segmentation.update_map)
-            ff_thread_await_progress(&prev_frame->tf, 1, 0);
-        if (is_vp7)
-            vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+        if (avctx->active_thread_type == FF_THREAD_FRAME)
+            num_jobs = 1;
         else
-            vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
-    }
-
-    if (avctx->active_thread_type == FF_THREAD_FRAME)
-        num_jobs = 1;
-    else
-        num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
-    s->num_jobs   = num_jobs;
-    s->curframe   = curframe;
-    s->prev_frame = prev_frame;
-    s->mv_min.y   = -MARGIN;
-    s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
-    for (i = 0; i < MAX_THREADS; i++) {
-        s->thread_data[i].thread_mb_pos = 0;
-        s->thread_data[i].wait_mb_pos   = INT_MAX;
-    }
-    if (is_vp7)
-        avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
+            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
+        s->num_jobs   = num_jobs;
+        s->curframe   = curframe;
+        s->prev_frame = prev_frame;
+        s->mv_min.y   = -MARGIN;
+        s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
+        for (i = 0; i < MAX_THREADS; i++) {
+            s->thread_data[i].thread_mb_pos = 0;
+            s->thread_data[i].wait_mb_pos   = INT_MAX;
+        }
+        if (is_vp7)
+            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                         num_jobs);
-    else
-        avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
+        else
+            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                         num_jobs);
-
+    }
     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
 
@@ -2727,6 +2775,7 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     int ret;
 
     s->avctx = avctx;
+    s->pix_fmt = AV_PIX_FMT_NONE;
     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->internal->allocate_progress = 1;
@@ -2800,14 +2849,18 @@ static int vp8_decode_update_thread_context(AVCodecContext *dst,
         s->mb_width  = s_src->mb_width;
         s->mb_height = s_src->mb_height;
     }
-
     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
     s->segmentation = s_src->segmentation;
     s->lf_delta     = s_src->lf_delta;
+    s->pix_fmt = s_src->pix_fmt;
+    s->mbskip_enabled = s_src->mbskip_enabled;
+    s->filter = s_src->filter;
     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
+    s->num_coeff_partitions = s_src->num_coeff_partitions;
+    s->header_partition_size = s_src->header_partition_size;
 
     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
-        if (s_src->frames[i].tf.f->data[0]) {
+        if (s_src->frames[i].tf.f->buf[0]) {
             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
             if (ret < 0)
                 return ret;
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 374e138..af7285f 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -136,6 +136,8 @@ typedef struct VP8intmv {
 typedef struct VP8Context {
     VP8ThreadData *thread_data;
     AVCodecContext *avctx;
+    enum AVPixelFormat pix_fmt;
+
     VP8Frame *framep[4];
     VP8Frame *next_framep[4];
     VP8Frame *curframe;
@@ -165,6 +167,7 @@ typedef struct VP8Context {
         uint8_t enabled;
         uint8_t absolute_vals;
         uint8_t update_map;
+        uint8_t update_feature_data;
         int8_t base_quant[4];
         int8_t filter_level[4];     ///< base loop filter level
     } segmentation;
@@ -192,9 +195,19 @@ typedef struct VP8Context {
         int16_t chroma_qmul[2];
     } qmat[4];
 
+    // Raw quantisation values, which may be needed by hwaccel decode.
     struct {
-        uint8_t enabled;    ///< whether each mb can have a different strength based on mode/ref
+        int yac_qi;
+        int ydc_delta;
+        int y2dc_delta;
+        int y2ac_delta;
+        int uvdc_delta;
+        int uvac_delta;
+    } quant;
 
+    struct {
+        uint8_t enabled;    ///< whether each mb can have a different strength based on mode/ref
+        uint8_t update;
         /**
          * filter strength adjustment for the following macroblock modes:
          * [0-3] - i16x16 (always zero)
@@ -221,6 +234,19 @@ typedef struct VP8Context {
 
     VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
 
+    /* Entropy coder state at the end of the header block, in the form
+     * specified by the standard.  It is kept for use by hwaccels, so
+     * that a hardware decoder has the information it needs to start
+     * decoding at the macroblock layer.
+     */
+    struct {
+        const uint8_t *input;
+        uint32_t range;
+        uint32_t value;
+        int bit_count;
+    } coder_state_at_header_end;
+
+    int header_partition_size;
     /**
      * These are all of the updatable probabilities for binary decisions.
      * They are only implicitly reset on keyframes, making it quite likely
@@ -258,6 +284,7 @@ typedef struct VP8Context {
      */
     int num_coeff_partitions;
     VP56RangeCoder coeff_partition[8];
+    int coeff_partition_size[8];
     VideoDSPContext vdsp;
     VP8DSPContext vp8dsp;
     H264PredContext hpc;
-- 
1.8.3.1