[FFmpeg-devel] [PATCH] h264: integrate clear_blocks calls with IDCT.
Michael Niedermayer
michaelni at gmx.at
Sat Feb 9 22:35:44 CET 2013
On Sat, Feb 09, 2013 at 10:23:35AM -0800, Ronald S. Bultje wrote:
> From: "Ronald S. Bultje" <rsbultje at gmail.com>
>
> In case of no-transform, integrate it with put_pixels4/8(). Intra PCM
> is changed to not use h->mb anymore (saves a memcpy). The one during
> update_thread_context() init is removed by removing the memcpy() that
> clobbered it in the first place. Together, this makes the H264 decoder
> almost-independent of dsputil.
>
> (PPC and Arm assembly not yet ported. Will port if the rest is OK'ed.)
> ---
> libavcodec/get_bits.h | 3 +-
> libavcodec/h264.c | 7 ++-
> libavcodec/h264.h | 1 +
> libavcodec/h264_cabac.c | 4 +-
> libavcodec/h264_cavlc.c | 10 ++--
> libavcodec/h264_mb_template.c | 20 +++----
> libavcodec/h264idct_template.c | 16 ++++--
> libavcodec/h264pred.h | 8 +--
> libavcodec/h264pred_template.c | 28 ++++++----
> libavcodec/svq3.c | 2 +
> libavcodec/x86/h264_idct.asm | 108 ++++++++++++++++++++++++++++---------
> libavcodec/x86/h264_idct_10bit.asm | 53 ++++++++++++++++--
> 12 files changed, 185 insertions(+), 75 deletions(-)
>
This fails h264-lossless too:
--- ./tests/ref/fate/h264-lossless 2013-02-09 22:21:42.881124947 +0100
+++ tests/data/fate/h264-lossless 2013-02-09 22:24:40.705128693 +0100
@@ -1,11 +1,11 @@
#tb 0: 83333/5000000
-0, 0, 0, 1, 460800, 0x7731dd2f
-0, 2, 2, 1, 460800, 0x944b8c64
-0, 3, 3, 1, 460800, 0xbe833041
-0, 4, 4, 1, 460800, 0xbe95d96a
-0, 5, 5, 1, 460800, 0xfe7ea5e6
-0, 6, 6, 1, 460800, 0x381743c7
-0, 7, 7, 1, 460800, 0x63fcc2e9
-0, 8, 8, 1, 460800, 0x79574960
-0, 9, 9, 1, 460800, 0xdab9e18a
-0, 10, 10, 1, 460800, 0xd88e8fe8
+0, 0, 0, 1, 460800, 0xa40e9b4e
+0, 2, 2, 1, 460800, 0x3ac9f364
+0, 3, 3, 1, 460800, 0x38e73edf
+0, 4, 4, 1, 460800, 0x97bb08b0
+0, 5, 5, 1, 460800, 0xd16f4b6d
+0, 6, 6, 1, 460800, 0x00f724f4
+0, 7, 7, 1, 460800, 0x8b0d742b
+0, 8, 8, 1, 460800, 0xddda09dc
+0, 9, 9, 1, 460800, 0x7b7e80b5
+0, 10, 10, 1, 460800, 0xb3243355
more comments below:
> diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
> index 7129b17..f16a508 100644
> --- a/libavcodec/get_bits.h
> +++ b/libavcodec/get_bits.h
> @@ -415,11 +415,12 @@ static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer,
> return init_get_bits(s, buffer, byte_size * 8);
> }
>
> -static inline void align_get_bits(GetBitContext *s)
> +static inline const uint8_t *align_get_bits(GetBitContext *s)
> {
> int n = -get_bits_count(s) & 7;
> if (n)
> skip_bits(s, n);
> + return s->buffer + (s->index >> 3);
> }
>
ok but should be a seperate patch/commit
[...]
> diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
> index 4a029a0..702dbc9 100644
> --- a/libavcodec/h264idct_template.c
> +++ b/libavcodec/h264idct_template.c
> @@ -79,6 +79,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
> dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6));
> dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6));
> }
> +
> + memset(block, 0, 16 * sizeof(dctcoef));
> }
>
> void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
> @@ -151,14 +153,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
> dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) );
> dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) );
> }
> +
> + memset(block, 0, 64 * sizeof(dctcoef));
> }
are there any architectures that use the C idct but have a optimized
clear_blocks ?
if not this is fine
[...]
> diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
> index e78f2d4..8d8d62e 100644
> --- a/libavcodec/h264pred_template.c
> +++ b/libavcodec/h264pred_template.c
> @@ -1132,7 +1132,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
> #undef PL
> #undef SRC
>
> -static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
> ptrdiff_t stride)
> {
> int i;
> @@ -1149,9 +1149,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
> pix++;
> block++;
> }
> +
> + memset(_block, 0, sizeof(dctcoef) * 16);
> }
>
> -static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
> ptrdiff_t stride)
> {
> int i;
> @@ -1167,9 +1169,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
> pix+= stride;
> block+= 4;
> }
> +
> + memset(_block, 0, sizeof(dctcoef) * 16);
> }
>
> -static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
> ptrdiff_t stride)
> {
> int i;
> @@ -1190,9 +1194,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
> pix++;
> block++;
> }
> +
> + memset(_block, 0, sizeof(dctcoef) * 64);
> }
>
> -static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
> ptrdiff_t stride)
> {
> int i;
> @@ -1212,10 +1218,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
> pix+= stride;
> block+= 8;
> }
> +
> + memset(_block, 0, sizeof(dctcoef) * 64);
> }
[...]
> diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
> index b79e69b..1dc2a6f 100644
> --- a/libavcodec/svq3.c
> +++ b/libavcodec/svq3.c
> @@ -210,6 +210,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block,
> dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20));
> dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20));
> }
> +
> + memset(block, 0, 16 * sizeof(int16_t));
> }
are there any architectures that have an optimized clear_blocks but
dont have above functions optimized ?
if no then this is fine
also, do you have some benchmarks that show the effect of these
changes ?
overall the idea & patch is sure good
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
No great genius has ever existed without some touch of madness. -- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130209/4e6d69ff/attachment.asc>
More information about the ffmpeg-devel
mailing list