[FFmpeg-devel] [PATCH] lavc/hevc_cabac: Optimise ff_hevc_hls_residual_coding (v2)

Christophe Gisquet christophe.gisquet at gmail.com
Fri Jan 22 18:52:23 CET 2016


Hi,

2016-01-21 11:45 GMT+01:00 John Cox <jc at kynesim.co.uk>:
> Hi
>
> v2 of my hevc residual patch

I'll review the bits not related to significant coeffs first, because I
think they are the most performance-sensitive. There are also bits that
could be moved to other patches; at least some are related to the later
bypass patch. Here's a list, detailed below:
- coefficient saturation, which I think is OK to commit
- bypass-related stuff
- boolean stuff (!!stuff), which I think is OK to commit
- cosmetics (like renaming a variable or introducing a shorthand)
- sig(nificant coefficients )map

The fact is, I've benchmarked parts of the code and I'm seeing slowdowns
as well as speedups on x86_64, which is why it would be nice to be able
to test and evaluate each of those parts separately.

> +// Helper fns
> +#ifndef hevc_mem_bits32
> +static av_always_inline uint32_t hevc_mem_bits32(const void * buf,
> const unsigned int offset)
> +{
> +    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
> +}
> +#endif
> +
> +#if !defined(hevc_clz32)
> +#define hevc_clz32 hevc_clz32_builtin
> +static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
> +{
> +    // ff_clz says works on ints (probably) - so adjust if int is >32 bits long
> +    // the fact that x is passed in as uint32_t will have cleared the top bits
> +    return ff_clz(x) - (sizeof(int) * 8 - 32);
> +}
> +#endif
> +
> +#define bypass_start(s)
> +#define bypass_finish(s)

bypass-related?

>  void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
>  {
>      if (s->ps.pps->entropy_coding_sync_enabled_flag &&
> @@ -863,19 +928,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int
> trafo_depth)
>      return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
>  }
>
> -static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
> +static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
>  {
> -    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
> +    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
>  }
>
> -static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
> +static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
>  {
> -    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
> +    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
>  }
>
> -static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
> +static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
>  {
> -    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
> +    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
>  }

Boolean stuff. Ideally, the whole boolean-stuff topic would be split out
as a separate patch, which I would be OK with.

>  int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
> @@ -891,14 +956,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
>      return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
>  }
>
> -static av_always_inline void
> last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
> +static av_always_inline void
> last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
>                                                     int log2_size, int
> *last_scx_prefix, int *last_scy_prefix)
>  {
>      int i = 0;
>      int max = (log2_size << 1) - 1;
>      int ctx_offset, ctx_shift;
>
> -    if (!c_idx) {
> +    if (!c_idx_nz) {
>          ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
>          ctx_shift = (log2_size + 1) >> 2;
>      } else {
> @@ -929,22 +994,16 @@ static av_always_inline int
> last_significant_coeff_suffix_decode(HEVCContext *s,
>      return value;
>  }
>
> -static av_always_inline int
> significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int
> ctx_cg)
> +static av_always_inline int
> significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int
> ctx_cg)

cosmetics?

>  {
>      int inc;
>
> -    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
> +    inc = (ctx_cg != 0) + (c_idx_nz << 1);
>
>      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
>  }

boolean stuff

> -static av_always_inline int significant_coeff_flag_decode(HEVCContext
> *s, int x_c, int y_c,
> -                                           int offset, const uint8_t
> *ctx_idx_map)
> -{
> -    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
> -    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
> -}

sigmap

> -static av_always_inline int
> significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
> +static av_always_inline int
> significant_coeff_flag_decode_0(HEVCContext *s, int offset)
>  {
>      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
>  }
> @@ -966,65 +1025,305 @@ static av_always_inline int
> coeff_abs_level_greater2_flag_decode(HEVCContext *s,
>      return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
>  }
>
> -static av_always_inline int
> coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
> +
> +#if !USE_BY22
> +#define coeff_abs_level_remaining_decode_bypass(s,r)
> coeff_abs_level_remaining_decode(s, r)
> +#endif
> +
> +
> +#ifndef coeff_abs_level_remaining_decode_bypass
> +static int coeff_abs_level_remaining_decode_bypass(HEVCContext *
> const s, const unsigned int rice_param)
>  {
> +    CABACContext * const c = &s->HEVClc->cc;
> +    uint32_t y;
> +    unsigned int prefix;
> +    unsigned int last_coeff_abs_level_remaining;
> +    unsigned int n;
> +
> +    y = get_cabac_by22_peek(c);
> +    prefix = hevc_clz32(~y);
> +    // y << prefix will always have top bit 0
> +
> +    if (prefix < 3) {
> +        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
> +        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
> +        n = prefix + 1 + rice_param;
> +    }
> +    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
> +    {
> +        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 -
> (prefix + rice_param));
> +
> +        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
> +        n = prefix * 2 + rice_param - 2;
> +    }
> +    else {
> +        unsigned int suffix;
> +
> +        get_cabac_by22_flush(c, prefix, y);
> +        y = get_cabac_by22_peek(c);
> +
> +        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
> +        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
> +        n = prefix + rice_param - 2;
> +    }
> +
> +    get_cabac_by22_flush(c, n, y);
> +
> +    return last_coeff_abs_level_remaining;
> +}
> +#endif

I think USE_BY22 is not yet defined?
Related to bypass.
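
Presumably something along these lines is intended elsewhere in the
patch set (my assumption, it is not visible in this hunk):

    #ifndef USE_BY22
    #define USE_BY22 1   /* enable the 22-bit bypass-peek fast path */
    #endif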

> +
> +static int coeff_abs_level_remaining_decode(HEVCContext * const s,
> int rc_rice_param)
> +{
> +    CABACContext * const c = &s->HEVClc->cc;
>      int prefix = 0;
>      int suffix = 0;
>      int last_coeff_abs_level_remaining;
>      int i;
>
> -    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
> +    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
>          prefix++;
>      if (prefix == CABAC_MAX_BIN) {
>          av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
>          return 0;
>      }
> +
>      if (prefix < 3) {
>          for (i = 0; i < rc_rice_param; i++)
> -            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
> +            suffix = (suffix << 1) | get_cabac_bypass(c);
>          last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
>      } else {
>          int prefix_minus3 = prefix - 3;
>          for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
> -            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
> +            suffix = (suffix << 1) | get_cabac_bypass(c);
>          last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
>                                                << rc_rice_param) + suffix;
>      }
> +
>      return last_coeff_abs_level_remaining;
>  }

Cosmetics.

>
> -static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
> +#if !USE_BY22
> +#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
> +static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s,
> const unsigned int nb)
>  {
> -    int i;
> -    int ret = 0;
> +    CABACContext * const c = &s->HEVClc->cc;
> +    unsigned int i;
> +    uint32_t ret = 0;
>
>      for (i = 0; i < nb; i++)
> -        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
> -    return ret;
> +        ret = (ret << 1) | get_cabac_bypass(c);
> +
> +    return ret << (32 - nb);
> +}
> +#endif
> +
> +#ifndef coeff_sign_flag_decode_bypass
> +static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext *
> const s, const unsigned int nb)
> +{
> +    CABACContext * const c = &s->HEVClc->cc;
> +    uint32_t y;
> +    y = get_cabac_by22_peek(c);
> +    get_cabac_by22_flush(c, nb, y);
> +    return y & ~(0xffffffffU >> nb);
> +}
> +#endif

bypass
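
Side note on the interface: both variants return the signs left-aligned
in the 32-bit word, so the caller can peel one sign per coefficient off
the top bit without needing nb again. A hypothetical consumer (names are
mine, not the patch's):

    static void apply_signs(int *coeffs, const uint8_t *idx,
                            unsigned int n, uint32_t sign_bits)
    {
        unsigned int i;
        for (i = 0; i < n; i++) {
            if (sign_bits & 0x80000000U)
                coeffs[idx[i]] = -coeffs[idx[i]];
            sign_bits <<= 1;
        }
    }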

> +#ifndef get_cabac_greater1_bits
> +static inline unsigned int get_cabac_greater1_bits(CABACContext *
> const c, const unsigned int n,
> +    uint8_t * const state0)
> +{
> +    unsigned int i;
> +    unsigned int rv = 0;
> +    for (i = 0; i != n; ++i) {
> +        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
> +        const unsigned int b = get_cabac(c, state0 + idx);
> +        rv = (rv << 1) | b;
> +    }
> +    return rv;
> +}
> +#endif

I suppose branch prediction could help here, but less likely than for
get_cabac_sig_coeff_flag_idxs.

Also for this and some others: why inline over av_always_inline?

> +// N.B. levels returned are the values assuming coeff_abs_level_remaining
> +// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
> +// this version of events.
> +static inline uint32_t get_greaterx_bits(HEVCContext * const s, const
> unsigned int n_end, int * const levels,
> +    int * const pprev_subset_coded, int * const psum,
> +    const unsigned int idx0_gt1, const unsigned int idx_gt2)
> +{
> +    CABACContext * const c = &s->HEVClc->cc;
> +    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
> +    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
> +    unsigned int rv;
> +    unsigned int i;
> +    const unsigned int n = FFMIN(n_end, 8);
> +
> +    // Really this is i != n but the simple unconditional loop is cheaper
> +    // and faster
> +    for (i = 0; i != 8; ++i)
> +        levels[i] = 1;

An AV_WN64 of 0x0101010101010101ULL, or even a memset (either of which
implies storing the levels as bytes), would be inlined by the compiler,
given its size, and done in the best possible way.
Also, the function is related to, but not strictly part of, the sigmap.
It should be OK to keep it in the same patch, though.
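
To spell out what I mean (sketch only; it assumes the level storage is
changed to bytes, which is not what the patch does):

    #include "libavutil/intreadwrite.h"

    /* one 64-bit store (or the equivalent memset) fills all eight
     * entries with 1 at once */
    static inline void set_levels_to_one(uint8_t levels[8])
    {
        AV_WN64(levels, 0x0101010101010101ULL);
        /* or: memset(levels, 1, 8); */
    }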

> +#ifndef trans_scale_sat
> +static inline int trans_scale_sat(const int level, const unsigned int
> scale, const unsigned int scale_m, const unsigned int shift)
> +{
> +    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift)
> + 1) >> 1);
> +}
> +#endif

Saturation; this could be a separate patch, which I would be OK with.
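
For my own sanity, the equivalence I believe is being exploited: the old
add/shift rounding is folded into the "+ 1) >> 1" of the saturating
helper, with shift pre-decremented by one. The old expression below is
reconstructed from memory, so treat this as a sketch:

    /* old behaviour (no saturation), shift as previously computed */
    static int dequant_old(int level, int scale, int scale_m, int shift)
    {
        const int add = 1 << (shift - 1);
        return (level * scale * scale_m + add) >> shift;
    }

    /* new behaviour: same expression as trans_scale_sat() above,
     * av_clip_int16() coming from libavutil/common.h */
    static int dequant_new(int level, int scale, int scale_m, int shift_m1)
    {
        return av_clip_int16((((level * scale * scale_m) >> shift_m1) + 1) >> 1);
    }

dequant_new(level, scale, scale_m, shift - 1) should match
dequant_old(level, scale, scale_m, shift), apart from the saturation
added by av_clip_int16().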

> +#ifndef update_rice
> +static inline void update_rice(uint8_t * const stat_coeff,
> +                              const unsigned int
> last_coeff_abs_level_remaining,
> +                              const unsigned int c_rice_param)
> +{
> +    const unsigned int x = (last_coeff_abs_level_remaining << 1) >>
> c_rice_param;
> +    if (x >= 6)
> +        (*stat_coeff)++;
> +    else if (x == 0 && *stat_coeff > 0)
> +        (*stat_coeff)--;
> +}
> +#endif

Related to, but not strictly, bypass?

> +// n must be > 0 on entry
> +#ifndef get_cabac_sig_coeff_flag_idxs
> +static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext *
> const c, uint8_t * const state0,
> +                                                     unsigned int n,
> +                                                     const uint8_t
> const * ctx_map,
> +                                                     uint8_t * p)
> +{
> +    do {
> +        if (get_cabac(c, state0 + ctx_map[n]))
> +            *p++ = n;
> +    } while (--n != 0);
> +    return p;
> +}
> +#endif

Doing:
    if (get_cabac(c, state0 + ctx_map[n]))
        *p++ = n;
    while (--n != 0) {
        if (get_cabac(c, state0 + ctx_map[n]))
            *p++ = n;
    }
is most likely faster, probably also on arm, if the branch prediction is good.

For this one, as well as the three following ones I left quoted, I think
av_always_inline is more important here than for some other functions,
which should instead be plain inline (i.e. left to the compiler to
decide).

> +
> +
> +static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t *
> const state0,
> +    unsigned int n,
> +    const uint8_t const * ctx_map,
> +    uint8_t * const flag_idx)
> +{
> +    int rv;
> +
> +    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map,
> flag_idx) - flag_idx;
> +
> +    return rv;
>  }

The x86 h264 implementation does it slightly differently, but it's hard
to say which would be best.
Let's say arm is more in need of the best performance here.

> +#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12,
> x13, x14, x15) {\
> +     x0,  x1,  x2,  x3,\
> +     x4,  x5,  x6,  x7,\
> +     x8,  x9, x10, x11,\
> +    x12, x13, x14, x15}
> +
> +#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12,
> x13, x14, x15) {\
> +     x0,  x4,  x8, x12,\
> +     x1,  x5,  x9, x13,\
> +     x2,  x6, x10, x14,\
> +     x3,  x7, x11, x15}
> +
> +#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12,
> x13, x14, x15) {\
> +     x0,  x4,  x1,  x8,\
> +     x5,  x2, x12,  x9,\
> +     x6,  x3, x13, x10,\
> +     x7, x14, x11, x15}

sigmap

So, this allows reducing the indirection (lut1[lut2[]]), which is useful.

I would have done it the same way, but it's sufficiently important
that it'd be nice to have someone else's opinion on how to do it.
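
To make the gain concrete, a hypothetical 4x4 case (table values are
illustrative, not copied from the patch):

    /* raster-order significance context map, standing in for ctx_idx_map */
    static const uint8_t ctx_map_raster[16] = {
        0, 1, 4, 5,
        2, 3, 4, 5,
        6, 6, 8, 8,
        7, 7, 8, 8,
    };

    /* the same 16 entries pre-permuted into diagonal scan order with the
     * patch's D4x4() macro, so entry n is the context of the n-th
     * coefficient visited by the scan */
    static const uint8_t ctx_map_diag[16] =
        D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8);

    /* before: inc = ctx_idx_map[(y_c << 2) + x_c]  (x_c/y_c read from
     *         the scan LUTs, i.e. two dependent loads per coefficient)
     * after:  inc = ctx_map_diag[n]                (one load)          */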

> +static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
> +                             uint8_t * const significant_coeff_group_flag,
> +                             const uint8_t * const scan_x_cg, const
> uint8_t * const scan_y_cg,
> +                             int * const pPrev_sig)
> +{
> +    while (--i >= 0) {
> +        unsigned int x_cg = scan_x_cg[i];
> +        unsigned int y_cg = scan_y_cg[i];
> +
> +        // For the flag decode we only care about Z/NZ but
> +        // we use the full Right + Down * 2 when calculating
> +        // significant coeff flags so we obtain it here.
> +        //
> +        // The group flag array is one longer than it needs to
> +        // be so we don't need to check for y_cg limits
> +        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
> +            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
> +
> +        if (i == 0 ||
> +            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
> +        {
> +            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
> +            *pPrev_sig = prev_sig;
> +            break;
> +        }
> +    }
> +
> +    return i;
> +}

Not strictly sigmap, but related to it. OK to be in the same patch or in
another one.

> +    const int c_idx_nz = (c_idx != 0);

boolean? (not sure)

> +
> +    int may_hide_sign;
> +

This and the related changes are a slightly separate topic from the rest,
but OK within any of the patches.

> -        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
> -        add      = 1 << (shift-1);
> -        scale    = level_scale[rem6[qp]] << (div6[qp]);
> -        scale_m  = 16; // default when no custom scaling lists.
> -        dc_scale = 16;
> +        // Shift is set to one less than will actually occur as the scale
> +        // and saturate step adds 1 and then shifts right again
> +        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
> +        scale = level_scale[rem6[qp]];
> +        if (div6[qp] >= shift) {
> +            scale <<= (div6[qp] - shift);
> +            shift = 0;
> +        } else {
> +            shift -= div6[qp];
> +        }

Saturation?

> +            dc_scale = scale_matrix[0];
>              if (log2_trafo_size >= 4)
>                  dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
>          }
> +        else
> +        {
> +            static const uint8_t sixteen_scale[64] = {
> +                16, 16, 16, 16, 16, 16, 16, 16,
> +                16, 16, 16, 16, 16, 16, 16, 16,
[...]
> +            1, 1, 1, 1, 1, 1, 1, 1,
> +            1, 1, 1, 1, 1, 1, 1, 1,
> +            1, 1, 1, 1, 1, 1, 1, 1,
> +        };
> +        scale_matrix = unit_scale;
>          shift        = 0;
> -        add          = 0;
> -        scale        = 0;
> -        dc_scale     = 0;
> +        scale        = 2;  // We will shift right to kill this
> +        dc_scale     = 1;

Saturation?
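
If I read it right, these values simply turn trans_scale_sat() into an
identity plus the clip:

    /* scale * scale_m == 2 * 1 and shift == 0, so for any level:
     *   (((level * 2) >> 0) + 1) >> 1 == level
     * i.e. the old "no scaling" path, now with int16 saturation. */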

> +#if USE_N_END_1
> +            if (nb_significant_coeff_flag == 1) {
> +                // There is a small gain to be had from special

So we decided to have #define USE_N_END_1 ARCH_ARM. As said in another
mail, there's a 10-20% loss for that code block.

USE_BY22_DIV also strangely provides no benefit.

Best regards,
-- 
Christophe

