[FFmpeg-cvslog] hevc: Add coefficient limiting to speed up IDCT
Mickaël Raulet
git at videolan.org
Tue Jan 31 17:05:23 EET 2017
ffmpeg | branch: master | Mickaël Raulet <mraulet at insa-rennes.fr> | Tue Jul 5 18:52:38 2016 +0200| [cc16da75c2f99d92f7a6461100f041352deb6d88] | committer: Luca Barbato
hevc: Add coefficient limiting to speed up IDCT
Integrated to libav by Josh de Kock <josh at itanimul.li>.
Signed-off-by: Alexandra Hájková <alexandra at khirnov.net>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cc16da75c2f99d92f7a6461100f041352deb6d88
---
libavcodec/hevc.c | 12 ++++++++++--
libavcodec/hevcdsp.h | 2 +-
libavcodec/hevcdsp_template.c | 38 +++++++++++++++++++++++++-------------
3 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index d8c707b..5d58b52 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -1218,8 +1218,16 @@ static void hls_residual_coding(HEVCContext *s, int x0, int y0,
int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
if (max_xy == 0)
s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
- else
- s->hevcdsp.idct[log2_trafo_size - 2](coeffs);
+ else {
+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+ if (max_xy < 4)
+ col_limit = FFMIN(4, col_limit);
+ else if (max_xy < 8)
+ col_limit = FFMIN(8, col_limit);
+ else if (max_xy < 12)
+ col_limit = FFMIN(24, col_limit);
+ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
+ }
}
}
s->hevcdsp.add_residual[log2_trafo_size - 2](dst, coeffs, stride);
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index bbc4cb2..199e5a9 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -46,7 +46,7 @@ typedef struct HEVCDSPContext {
void (*dequant)(int16_t *coeffs);
void (*transform_4x4_luma)(int16_t *coeffs);
- void (*idct[4])(int16_t *coeffs);
+ void (*idct[4])(int16_t *coeffs, int col_limit);
void (*idct_dc[4])(int16_t *coeffs);
void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 81e3ea5..076b251 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -137,7 +137,7 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
#undef TR_4x4_LUMA
-#define TR_4(dst, src, dstep, sstep, assign) \
+#define TR_4(dst, src, dstep, sstep, assign, end) \
do { \
const int e0 = transform[8 * 0][0] * src[0 * sstep] + \
transform[8 * 2][0] * src[2 * sstep]; \
@@ -154,15 +154,15 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
assign(dst[3 * dstep], e0 - o0); \
} while (0)
-#define TR_8(dst, src, dstep, sstep, assign) \
+#define TR_8(dst, src, dstep, sstep, assign, end) \
do { \
int i, j; \
int e_8[4]; \
int o_8[4] = { 0 }; \
for (i = 0; i < 4; i++) \
- for (j = 1; j < 8; j += 2) \
+ for (j = 1; j < end; j += 2) \
o_8[i] += transform[4 * j][i] * src[j * sstep]; \
- TR_4(e_8, src, 1, 2 * sstep, SET); \
+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
\
for (i = 0; i < 4; i++) { \
assign(dst[i * dstep], e_8[i] + o_8[i]); \
@@ -170,15 +170,15 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
} \
} while (0)
-#define TR_16(dst, src, dstep, sstep, assign) \
+#define TR_16(dst, src, dstep, sstep, assign, end) \
do { \
int i, j; \
int e_16[8]; \
int o_16[8] = { 0 }; \
for (i = 0; i < 8; i++) \
- for (j = 1; j < 16; j += 2) \
+ for (j = 1; j < end; j += 2) \
o_16[i] += transform[2 * j][i] * src[j * sstep]; \
- TR_8(e_16, src, 1, 2 * sstep, SET); \
+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
\
for (i = 0; i < 8; i++) { \
assign(dst[i * dstep], e_16[i] + o_16[i]); \
@@ -186,15 +186,15 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
} \
} while (0)
-#define TR_32(dst, src, dstep, sstep, assign) \
+#define TR_32(dst, src, dstep, sstep, assign, end) \
do { \
int i, j; \
int e_32[16]; \
int o_32[16] = { 0 }; \
for (i = 0; i < 16; i++) \
- for (j = 1; j < 32; j += 2) \
+ for (j = 1; j < end; j += 2) \
o_32[i] += transform[j][i] * src[j * sstep]; \
- TR_16(e_32, src, 1, 2 * sstep, SET); \
+ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
\
for (i = 0; i < 16; i++) { \
assign(dst[i * dstep], e_32[i] + o_32[i]); \
@@ -202,23 +202,35 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
} \
} while (0)
+#define IDCT_VAR4(H) \
+ int limit2 = FFMIN(col_limit + 4, H)
+#define IDCT_VAR8(H) \
+ int limit = FFMIN(col_limit, H); \
+ int limit2 = FFMIN(col_limit + 4, H)
+#define IDCT_VAR16(H) IDCT_VAR8(H)
+#define IDCT_VAR32(H) IDCT_VAR8(H)
+
#define IDCT(H) \
-static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs) \
+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
+ int col_limit) \
{ \
int i; \
int shift = 7; \
int add = 1 << (shift - 1); \
int16_t *src = coeffs; \
+ IDCT_VAR ## H(H); \
\
for (i = 0; i < H; i++) { \
- TR_ ## H(src, src, H, H, SCALE); \
+ TR_ ## H(src, src, H, H, SCALE, limit2); \
+ if (limit2 < H && i%4 == 0 && !!i) \
+ limit2 -= 4; \
src++; \
} \
\
shift = 20 - BIT_DEPTH; \
add = 1 << (shift - 1); \
for (i = 0; i < H; i++) { \
- TR_ ## H(coeffs, coeffs, 1, 1, SCALE); \
+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
coeffs += H; \
} \
}
More information about the ffmpeg-cvslog mailing list