[FFmpeg-devel] [PATCH 1/2] apv_decode: Multisymbol entropy decode
Mark Thompson
sw at jkqxz.net
Mon May 5 23:21:27 EEST 2025
---
Decode up to four symbols per step with larger lookup tables. This is highly finicky because a lot of internal state has to be tracked and it therefore doesn't fit at all into the existing multisymbol VLC structure.
On a big core (Alder Lake P core) this makes the whole decoder 30-90% faster (largest gains at high bitrate). On a smaller core (Alder Lake E core) the gain is less, but is still there. There may be some better tuning for the smaller core here (make tables smaller by skipping lookup in some cases?) but I have not pursued it.
Thanks,
- Mark
libavcodec/apv_decode.c | 5 +-
libavcodec/apv_decode.h | 59 ++++--
libavcodec/apv_entropy.c | 442 ++++++++++++++++++++++++++++-----------
3 files changed, 363 insertions(+), 143 deletions(-)
diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c
index e15c125b58..eb47298e2e 100644
--- a/libavcodec/apv_decode.c
+++ b/libavcodec/apv_decode.c
@@ -160,6 +160,7 @@ static int apv_decode_block(AVCodecContext *avctx,
int err;
LOCAL_ALIGNED_32(int16_t, coeff, [64]);
+ memset(coeff, 0, 64 * sizeof(int16_t));
err = ff_apv_entropy_decode_block(coeff, gbc, entropy_state);
if (err < 0)
@@ -216,8 +217,8 @@ static int apv_decode_tile_component(AVCodecContext *avctx, void *data,
.log_ctx = avctx,
.decode_lut = &decode_lut,
.prev_dc = 0,
- .prev_dc_diff = 20,
- .prev_1st_ac_level = 0,
+ .prev_k_dc = 5,
+ .prev_k_level = 0,
};
int err;
diff --git a/libavcodec/apv_decode.h b/libavcodec/apv_decode.h
index 34c6176ea0..d94802e208 100644
--- a/libavcodec/apv_decode.h
+++ b/libavcodec/apv_decode.h
@@ -33,14 +33,39 @@
#define APV_VLC_LUT_BITS 9
#define APV_VLC_LUT_SIZE (1 << APV_VLC_LUT_BITS)
-typedef struct APVVLCLUTEntry {
+typedef struct APVSingleVLCLUTEntry {
uint16_t result; // Return value if not reading more.
uint8_t consume; // Number of bits to consume.
uint8_t more; // Whether to read additional bits.
-} APVVLCLUTEntry;
+} APVSingleVLCLUTEntry;
+
+typedef struct APVMultiVLCLUTEntry {
+ // Number of symbols this bit stream resolves to.
+ uint8_t count;
+ // k_run after decoding all symbols.
+ uint8_t k_run : 2;
+ // k_level after decoding the first level symbol.
+ uint8_t k_level_0 : 3;
+ // k_level after decoding all symbols.
+ uint8_t k_level_1 : 3;
+ // Run output values.
+ uint8_t run[2];
+ // Level output values.
+ int16_t level[2];
+ // Bit index of the end of each code.
+ uint8_t offset[4];
+} APVMultiVLCLUTEntry;
typedef struct APVVLCLUT {
- APVVLCLUTEntry lut[6][APV_VLC_LUT_SIZE];
+ // Single-symbol LUT for VLCs.
+ // Applies to all coefficients, but used only for DC coefficients
+ // in the decoder.
+ APVSingleVLCLUTEntry single_lut[6][APV_VLC_LUT_SIZE];
+ // Multi-symbol LUT for run/level combinations, decoding up to four
+ // symbols per step. Comes in two versions, which to use depends on
+ // whether the next symbol is a run or a level.
+ APVMultiVLCLUTEntry run_first_lut[3][5][APV_VLC_LUT_SIZE];
+ APVMultiVLCLUTEntry level_first_lut[3][5][APV_VLC_LUT_SIZE];
} APVVLCLUT;
typedef struct APVEntropyState {
@@ -48,33 +73,29 @@ typedef struct APVEntropyState {
const APVVLCLUT *decode_lut;
+ // Previous DC level value.
int16_t prev_dc;
- int16_t prev_dc_diff;
- int16_t prev_1st_ac_level;
+ // k parameter implied by the previous DC level value.
+ uint8_t prev_k_dc;
+ // k parameter implied by the previous first AC level value.
+ uint8_t prev_k_level;
} APVEntropyState;
/**
- * Build the decoder VLC look-up table.
+ * Build the decoder VLC look-up tables.
*/
void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut);
/**
* Entropy decode a single 8x8 block to coefficients.
*
- * Outputs in block order (dezigzag already applied).
+ * Outputs only the nonzero coefficients, in block row-major order
+ * (dezigzag is applied within the function). The output block
+ * must have been filled with zeroes before calling this function.
*/
-int ff_apv_entropy_decode_block(int16_t *coeff,
- GetBitContext *gbc,
- APVEntropyState *state);
-
-/**
- * Read a single APV VLC code.
- *
- * This entrypoint is exposed for testing.
- */
-unsigned int ff_apv_read_vlc(GetBitContext *gbc, int k_param,
- const APVVLCLUT *lut);
-
+int ff_apv_entropy_decode_block(int16_t *restrict coeff,
+ GetBitContext *restrict gbc,
+ APVEntropyState *restrict state);
#endif /* AVCODEC_APV_DECODE_H */
diff --git a/libavcodec/apv_entropy.c b/libavcodec/apv_entropy.c
index a5648c09b4..49b568c094 100644
--- a/libavcodec/apv_entropy.c
+++ b/libavcodec/apv_entropy.c
@@ -19,15 +19,55 @@
#include "apv.h"
#include "apv_decode.h"
+#include "put_bits.h"
+
+
+av_always_inline
+static unsigned int apv_read_vlc(GetBitContext *restrict gbc, int k_param,
+ const APVVLCLUT *restrict lut)
+{
+ unsigned int next_bits;
+ const APVSingleVLCLUTEntry *ent;
+
+ next_bits = show_bits(gbc, APV_VLC_LUT_BITS);
+ ent = &lut->single_lut[k_param][next_bits];
+
+ if (ent->more) {
+ unsigned int leading_zeroes;
+
+ skip_bits(gbc, ent->consume);
+
+ next_bits = show_bits(gbc, 16);
+ leading_zeroes = 15 - av_log2(next_bits);
+
+ if (leading_zeroes == 0) {
+ // This can't happen mid-stream because the lookup would
+ // have resolved a leading one into a shorter code, but it
+ // can happen if we are hitting the end of the buffer.
+ // Return an invalid code to propagate as an error.
+ return APV_MAX_TRANS_COEFF + 1;
+ }
+
+ skip_bits(gbc, leading_zeroes + 1);
+
+ return (2 << k_param) +
+ ((1 << leading_zeroes) - 1) * (1 << k_param) +
+ get_bits(gbc, leading_zeroes + k_param);
+ } else {
+ skip_bits(gbc, ent->consume);
+ return ent->result;
+ }
+}
void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut)
{
const int code_len = APV_VLC_LUT_BITS;
const int lut_size = APV_VLC_LUT_SIZE;
+ // Build the single-symbol VLC table.
for (int k = 0; k <= 5; k++) {
for (unsigned int code = 0; code < lut_size; code++) {
- APVVLCLUTEntry *ent = &decode_lut->lut[k][code];
+ APVSingleVLCLUTEntry *ent = &decode_lut->single_lut[k][code];
unsigned int first_bit = code & (1 << code_len - 1);
unsigned int remaining_bits = code ^ first_bit;
@@ -64,152 +104,310 @@ void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut)
}
}
}
-}
-
-av_always_inline
-static unsigned int apv_read_vlc(GetBitContext *gbc, int k_param,
- const APVVLCLUT *lut)
-{
- unsigned int next_bits;
- const APVVLCLUTEntry *ent;
- next_bits = show_bits(gbc, APV_VLC_LUT_BITS);
- ent = &lut->lut[k_param][next_bits];
-
- if (ent->more) {
- unsigned int leading_zeroes;
-
- skip_bits(gbc, ent->consume);
-
- next_bits = show_bits(gbc, 16);
- leading_zeroes = 15 - av_log2(next_bits);
-
- if (leading_zeroes == 0) {
- // This can't happen mid-stream because the lookup would
- // have resolved a leading one into a shorter code, but it
- // can happen if we are hitting the end of the buffer.
- // Return an invalid code to propagate as an error.
- return APV_MAX_TRANS_COEFF + 1;
+ // Build the multi-symbol VLC table.
+ for (int start_run = 0; start_run <= 2; start_run++) {
+ for (int start_level = 0; start_level <= 4; start_level++) {
+ for (unsigned int code = 0; code < lut_size; code++) {
+ APVMultiVLCLUTEntry *ent;
+ int k_run, k_level;
+ GetBitContext gbc;
+ PutBitContext pbc;
+ uint8_t buffer[16];
+ uint8_t run_first_buffer[16];
+ uint8_t level_first_buffer[16];
+
+ memset(buffer, 0, sizeof(buffer));
+ init_put_bits(&pbc, buffer, sizeof(buffer));
+ put_bits(&pbc, APV_VLC_LUT_BITS, code);
+ flush_put_bits(&pbc);
+
+ memcpy(run_first_buffer, buffer, sizeof(buffer));
+ memcpy(level_first_buffer, buffer, sizeof(buffer));
+
+ k_run = start_run;
+ k_level = start_level;
+
+ ent = &decode_lut->run_first_lut[k_run][k_level][code];
+ memset(ent, 0, sizeof(*ent));
+ init_get_bits8(&gbc, run_first_buffer, sizeof(run_first_buffer));
+
+ ent->count = 0;
+ for (int i = 0; i <= 1; i++) {
+ int value, sign, pos;
+
+ value = apv_read_vlc(&gbc, k_run, decode_lut);
+ pos = get_bits_count(&gbc);
+ if (pos > APV_VLC_LUT_BITS)
+ break;
+ ent->run[i] = value;
+ ent->offset[ent->count] = pos;
+ ++ent->count;
+ k_run = FFMIN(value >> 2, 2);
+
+ value = apv_read_vlc(&gbc, k_level, decode_lut);
+ sign = get_bits1(&gbc);
+ pos = get_bits_count(&gbc);
+ if (pos > APV_VLC_LUT_BITS)
+ break;
+ ++value;
+ ent->level[i] = sign ? -value : value;
+ ent->offset[ent->count] = pos;
+ ++ent->count;
+ k_level = FFMIN(value >> 2, 4);
+ if (i == 0)
+ ent->k_level_0 = k_level;
+ }
+ if (ent->count > 0 && ent->count < 4)
+ ent->offset[3] = ent->offset[ent->count - 1];
+ ent->k_run = k_run;
+ ent->k_level_1 = k_level;
+
+ k_run = start_run;
+ k_level = start_level;
+
+ ent = &decode_lut->level_first_lut[k_run][k_level][code];
+ memset(ent, 0, sizeof(*ent));
+ init_get_bits8(&gbc, level_first_buffer, sizeof(level_first_buffer));
+
+ ent->count = 0;
+ for (int i = 0; i <= 1; i++) {
+ int value, sign, pos;
+
+ value = apv_read_vlc(&gbc, k_level, decode_lut);
+ sign = get_bits1(&gbc);
+ pos = get_bits_count(&gbc);
+ if (pos > APV_VLC_LUT_BITS)
+ break;
+ ++value;
+ ent->level[i] = sign ? -value : value;
+ ent->offset[ent->count] = pos;
+ ++ent->count;
+ k_level = FFMIN(value >> 2, 4);
+ if (i == 0)
+ ent->k_level_0 = k_level;
+
+ value = apv_read_vlc(&gbc, k_run, decode_lut);
+ pos = get_bits_count(&gbc);
+ if (pos > APV_VLC_LUT_BITS)
+ break;
+ ent->run[i] = value;
+ ent->offset[ent->count] = pos;
+ ++ent->count;
+ k_run = FFMIN(value >> 2, 2);
+ }
+ if (ent->count > 0 && ent->count < 4)
+ ent->offset[3] = ent->offset[ent->count - 1];
+ ent->k_run = k_run;
+ ent->k_level_1 = k_level;
+ }
}
-
- skip_bits(gbc, leading_zeroes + 1);
-
- return (2 << k_param) +
- ((1 << leading_zeroes) - 1) * (1 << k_param) +
- get_bits(gbc, leading_zeroes + k_param);
- } else {
- skip_bits(gbc, ent->consume);
- return ent->result;
}
}
-unsigned int ff_apv_read_vlc(GetBitContext *gbc, int k_param,
- const APVVLCLUT *lut)
-{
- return apv_read_vlc(gbc, k_param, lut);
-}
-
-int ff_apv_entropy_decode_block(int16_t *coeff,
- GetBitContext *gbc,
- APVEntropyState *state)
+int ff_apv_entropy_decode_block(int16_t *restrict coeff,
+ GetBitContext *restrict gbc,
+ APVEntropyState *restrict state)
{
const APVVLCLUT *lut = state->decode_lut;
- int k_param;
+ int scan_pos, next_is_run;
+ int k_dc = state->prev_k_dc;
+ int k_run = 0;
+ int k_level = state->prev_k_level;
+ int first_ac = 1;
- // DC coefficient.
+ // Read one DC coefficient.
{
- int abs_dc_coeff_diff;
- int sign_dc_coeff_diff;
- int dc_coeff;
-
- k_param = av_clip(state->prev_dc_diff >> 1, 0, 5);
- abs_dc_coeff_diff = apv_read_vlc(gbc, k_param, lut);
-
- if (abs_dc_coeff_diff > 0)
- sign_dc_coeff_diff = get_bits1(gbc);
- else
- sign_dc_coeff_diff = 0;
-
- if (sign_dc_coeff_diff)
- dc_coeff = state->prev_dc - abs_dc_coeff_diff;
- else
- dc_coeff = state->prev_dc + abs_dc_coeff_diff;
-
- if (dc_coeff < APV_MIN_TRANS_COEFF ||
- dc_coeff > APV_MAX_TRANS_COEFF) {
- av_log(state->log_ctx, AV_LOG_ERROR,
- "Out-of-range DC coefficient value: %d "
- "(from prev_dc %d abs_dc_coeff_diff %d sign_dc_coeff_diff %d)\n",
- dc_coeff, state->prev_dc, abs_dc_coeff_diff, sign_dc_coeff_diff);
- return AVERROR_INVALIDDATA;
+ int dc, abs_diff, sign;
+
+ abs_diff = apv_read_vlc(gbc, k_dc, lut);
+
+ if (abs_diff) {
+ sign = get_bits1(gbc);
+ if (sign)
+ dc = state->prev_dc - abs_diff;
+ else
+ dc = state->prev_dc + abs_diff;
+ } else {
+ dc = state->prev_dc;
}
- coeff[0] = dc_coeff;
+ coeff[0] = dc;
- state->prev_dc = dc_coeff;
- state->prev_dc_diff = abs_dc_coeff_diff;
+ state->prev_dc = dc;
+ state->prev_k_dc = FFMIN(abs_diff >> 1, 5);
}
- // AC coefficients.
- {
- int scan_pos = 1;
- int first_ac = 1;
- int prev_level = state->prev_1st_ac_level;
- int prev_run = 0;
-
- do {
- int coeff_zero_run;
-
- k_param = av_clip(prev_run >> 2, 0, 2);
- coeff_zero_run = apv_read_vlc(gbc, k_param, lut);
-
- if (coeff_zero_run > APV_BLK_COEFFS - scan_pos) {
- av_log(state->log_ctx, AV_LOG_ERROR,
- "Out-of-range zero-run value: %d (at scan pos %d)\n",
- coeff_zero_run, scan_pos);
- return AVERROR_INVALIDDATA;
- }
+ // Alternate reading run and level until reaching the end of
+ // the block.
+ scan_pos = 1;
+ next_is_run = 1;
+ while (1) {
+ uint32_t next_bits, lut_bits;
+ const APVMultiVLCLUTEntry *ent;
+
+ // Read 18 bits and look up the first part of them in either the
+ // run-first or the level-first table. If the next code is too
+ // long the 18 bits will allow resolving a run code (up to 63)
+ // without reading any more bits, and will allow the exact
+ // length of a level code to be determined. (Note that the
+ // single-symbol LUT is never useful here as the multisymbol
+ // lookup has already determined that the code is too long.)
+
+ next_bits = show_bits(gbc, 18);
+ lut_bits = next_bits >> (18 - APV_VLC_LUT_BITS);
+
+ if (next_is_run) {
+
+ ent = &lut->run_first_lut[k_run][k_level][lut_bits];
+
+ if (ent->count == 0) {
+ // One long code.
+ uint32_t bits, low_bits;
+ unsigned int leading_zeroes, low_bit_count, low_bit_shift;
+ int run;
+
+ // Remove the prefix bits.
+ bits = next_bits & 0xffff;
+ // Determine code length.
+ leading_zeroes = 15 - av_log2(bits);
+ // Extract the low bits.
+ low_bit_count = leading_zeroes + k_run;
+ low_bit_shift = 16 - (1 + 2 * leading_zeroes + k_run);
+ low_bits = (bits >> low_bit_shift) & ((1 << low_bit_count) - 1);
+ // Construct run code.
+ run = (2 << k_run) +
+ ((1 << leading_zeroes) - 1) * (1 << k_run) +
+ low_bits;
+ // Skip over the bits just used.
+ skip_bits(gbc, 2 + leading_zeroes + 1 + low_bit_count);
+
+ scan_pos += run;
+ if (scan_pos >= 64)
+ break;
+ k_run = FFMIN(run >> 2, 2);
+ next_is_run = 0;
- for (int i = 0; i < coeff_zero_run; i++) {
- coeff[ff_zigzag_direct[scan_pos]] = 0;
- ++scan_pos;
- }
- prev_run = coeff_zero_run;
-
- if (scan_pos < APV_BLK_COEFFS) {
- int abs_ac_coeff_minus1;
- int sign_ac_coeff;
- int level;
-
- k_param = av_clip(prev_level >> 2, 0, 4);
- abs_ac_coeff_minus1 = apv_read_vlc(gbc, k_param, lut);
- sign_ac_coeff = get_bits(gbc, 1);
+ } else {
+ // One or more short codes.
- if (sign_ac_coeff)
- level = -abs_ac_coeff_minus1 - 1;
- else
- level = abs_ac_coeff_minus1 + 1;
-
- if (level < APV_MIN_TRANS_COEFF ||
- level > APV_MAX_TRANS_COEFF) {
- av_log(state->log_ctx, AV_LOG_ERROR,
- "Out-of-range AC coefficient value: %d "
- "(from prev_level %d abs_ac_coeff_minus1 %d sign_ac_coeff %d)\n",
- level, prev_level, abs_ac_coeff_minus1, sign_ac_coeff);
+ scan_pos += ent->run[0];
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[0]);
+ break;
}
+ if (ent->count > 1) {
+ coeff[ff_zigzag_direct[scan_pos]] = ent->level[0];
+ ++scan_pos;
+ if (first_ac) {
+ state->prev_k_level = ent->k_level_0;
+ first_ac = 0;
+ }
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[1]);
+ break;
+ }
+ }
+ if (ent->count > 2) {
+ scan_pos += ent->run[1];
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[2]);
+ break;
+ }
+ }
+ if (ent->count > 3) {
+ coeff[ff_zigzag_direct[scan_pos]] = ent->level[1];
+ ++scan_pos;
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[3]);
+ break;
+ }
+ }
+ skip_bits(gbc, ent->offset[3]);
+ k_run = ent->k_run;
+ k_level = ent->k_level_1;
+ next_is_run = !(ent->count & 1);
+ }
+ } else {
+
+ ent = &lut->level_first_lut[k_run][k_level][lut_bits];
+
+ if (ent->count == 0) {
+ // One long code.
+ uint32_t bits;
+ unsigned int leading_zeroes;
+ int level, abs_level, sign;
+
+ // Remove the prefix bits.
+ bits = next_bits & 0xffff;
+ // Determine code length.
+ leading_zeroes = 15 - av_log2(bits);
+ // Skip the prefix and length bits.
+ skip_bits(gbc, 2 + leading_zeroes + 1);
+ // Read the rest of the code and construct the level.
+ // Include the + 1 offset for nonzero value here.
+ abs_level = (2 << k_level) +
+ ((1 << leading_zeroes) - 1) * (1 << k_level) +
+ get_bits(gbc, leading_zeroes + k_level) + 1;
+
+ sign = get_bits(gbc, 1);
+ if (sign)
+ level = -abs_level;
+ else
+ level = abs_level;
coeff[ff_zigzag_direct[scan_pos]] = level;
-
- prev_level = abs_ac_coeff_minus1 + 1;
+ ++scan_pos;
+ k_level = FFMIN(abs_level >> 2, 4);
if (first_ac) {
- state->prev_1st_ac_level = prev_level;
+ state->prev_k_level = k_level;
first_ac = 0;
}
+ if (scan_pos >= 64)
+ break;
+ next_is_run = 1;
+
+ } else {
+ // One or more short codes.
+ coeff[ff_zigzag_direct[scan_pos]] = ent->level[0];
++scan_pos;
+ if (first_ac) {
+ state->prev_k_level = ent->k_level_0;
+ first_ac = 0;
+ }
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[0]);
+ break;
+ }
+ if (ent->count > 1) {
+ scan_pos += ent->run[0];
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[1]);
+ break;
+ }
+ }
+ if (ent->count > 2) {
+ coeff[ff_zigzag_direct[scan_pos]] = ent->level[1];
+ ++scan_pos;
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[2]);
+ break;
+ }
+ }
+ if (ent->count > 3) {
+ scan_pos += ent->run[1];
+ if (scan_pos >= 64) {
+ skip_bits(gbc, ent->offset[3]);
+ break;
+ }
+ }
+ skip_bits(gbc, ent->offset[3]);
+ k_run = ent->k_run;
+ k_level = ent->k_level_1;
+ next_is_run = ent->count & 1;
}
-
- } while (scan_pos < APV_BLK_COEFFS);
+ }
}
return 0;
--
2.47.2
More information about the ffmpeg-devel
mailing list