[FFmpeg-devel] [PATCH] h264: integrate clear_blocks calls with IDCT.

Michael Niedermayer michaelni at gmx.at
Sat Feb 9 22:35:44 CET 2013


On Sat, Feb 09, 2013 at 10:23:35AM -0800, Ronald S. Bultje wrote:
> From: "Ronald S. Bultje" <rsbultje at gmail.com>
> 
> In case of no-transform, integrate it with put_pixels4/8(). Intra PCM
> is changed to not use h->mb anymore (saves a memcpy). The one during
> update_thread_context() init is removed by removing the memcpy() that
> clobbered it in the first place. Together, this makes the H264 decoder
> almost-independent of dsputil.
> 
> (PPC and Arm assembly not yet ported. Will port if the rest is OK'ed.)
> ---
>  libavcodec/get_bits.h              |   3 +-
>  libavcodec/h264.c                  |   7 ++-
>  libavcodec/h264.h                  |   1 +
>  libavcodec/h264_cabac.c            |   4 +-
>  libavcodec/h264_cavlc.c            |  10 ++--
>  libavcodec/h264_mb_template.c      |  20 +++----
>  libavcodec/h264idct_template.c     |  16 ++++--
>  libavcodec/h264pred.h              |   8 +--
>  libavcodec/h264pred_template.c     |  28 ++++++----
>  libavcodec/svq3.c                  |   2 +
>  libavcodec/x86/h264_idct.asm       | 108 ++++++++++++++++++++++++++++---------
>  libavcodec/x86/h264_idct_10bit.asm |  53 ++++++++++++++++--
>  12 files changed, 185 insertions(+), 75 deletions(-)
>

This fails h264-lossless too:
--- ./tests/ref/fate/h264-lossless  2013-02-09 22:21:42.881124947 +0100
+++ tests/data/fate/h264-lossless   2013-02-09 22:24:40.705128693 +0100
@@ -1,11 +1,11 @@
 #tb 0: 83333/5000000
-0,          0,          0,        1,   460800, 0x7731dd2f
-0,          2,          2,        1,   460800, 0x944b8c64
-0,          3,          3,        1,   460800, 0xbe833041
-0,          4,          4,        1,   460800, 0xbe95d96a
-0,          5,          5,        1,   460800, 0xfe7ea5e6
-0,          6,          6,        1,   460800, 0x381743c7
-0,          7,          7,        1,   460800, 0x63fcc2e9
-0,          8,          8,        1,   460800, 0x79574960
-0,          9,          9,        1,   460800, 0xdab9e18a
-0,         10,         10,        1,   460800, 0xd88e8fe8
+0,          0,          0,        1,   460800, 0xa40e9b4e
+0,          2,          2,        1,   460800, 0x3ac9f364
+0,          3,          3,        1,   460800, 0x38e73edf
+0,          4,          4,        1,   460800, 0x97bb08b0
+0,          5,          5,        1,   460800, 0xd16f4b6d
+0,          6,          6,        1,   460800, 0x00f724f4
+0,          7,          7,        1,   460800, 0x8b0d742b
+0,          8,          8,        1,   460800, 0xddda09dc
+0,          9,          9,        1,   460800, 0x7b7e80b5
+0,         10,         10,        1,   460800, 0xb3243355


more comments below:


> diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
> index 7129b17..f16a508 100644
> --- a/libavcodec/get_bits.h
> +++ b/libavcodec/get_bits.h
> @@ -415,11 +415,12 @@ static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer,
>      return init_get_bits(s, buffer, byte_size * 8);
>  }
>  
> -static inline void align_get_bits(GetBitContext *s)
> +static inline const uint8_t *align_get_bits(GetBitContext *s)
>  {
>      int n = -get_bits_count(s) & 7;
>      if (n)
>          skip_bits(s, n);
> +    return s->buffer + (s->index >> 3);
>  }
>

ok but should be a seperate patch/commit


[...]
> diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
> index 4a029a0..702dbc9 100644
> --- a/libavcodec/h264idct_template.c
> +++ b/libavcodec/h264idct_template.c
> @@ -79,6 +79,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
>          dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6));
>          dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6));
>      }
> +
> +    memset(block, 0, 16 * sizeof(dctcoef));
>  }
>  
>  void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
> @@ -151,14 +153,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
>          dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) );
>          dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) );
>      }
> +
> +    memset(block, 0, 64 * sizeof(dctcoef));
>  }

are there any architectures that use the C idct but have a optimized
clear_blocks ?
if not this is fine


[...]

> diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
> index e78f2d4..8d8d62e 100644
> --- a/libavcodec/h264pred_template.c
> +++ b/libavcodec/h264pred_template.c
> @@ -1132,7 +1132,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
>  #undef PL
>  #undef SRC
>  
> -static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
>                                          ptrdiff_t stride)
>  {
>      int i;
> @@ -1149,9 +1149,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
>          pix++;
>          block++;
>      }
> +
> +    memset(_block, 0, sizeof(dctcoef) * 16);
>  }
>  
> -static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
>                                            ptrdiff_t stride)
>  {
>      int i;
> @@ -1167,9 +1169,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
>          pix+= stride;
>          block+= 4;
>      }
> +
> +    memset(_block, 0, sizeof(dctcoef) * 16);
>  }
>  
> -static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
>                                           ptrdiff_t stride)
>  {
>      int i;
> @@ -1190,9 +1194,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
>          pix++;
>          block++;
>      }
> +
> +    memset(_block, 0, sizeof(dctcoef) * 64);
>  }
>  
> -static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
> +static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
>                                             ptrdiff_t stride)
>  {
>      int i;
> @@ -1212,10 +1218,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
>          pix+= stride;
>          block+= 8;
>      }
> +
> +    memset(_block, 0, sizeof(dctcoef) * 64);
>  }
[...]
> diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
> index b79e69b..1dc2a6f 100644
> --- a/libavcodec/svq3.c
> +++ b/libavcodec/svq3.c
> @@ -210,6 +210,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block,
>          dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20));
>          dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20));
>      }
> +
> +    memset(block, 0, 16 * sizeof(int16_t));
>  }

are there any architectures that have an optimized clear_blocks but
dont have above functions optimized ?
if no then this is fine


also, do you have some benchmarks that show the effect of these
changes ?

overall the idea & patch is sure good

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No great genius has ever existed without some touch of madness. -- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130209/4e6d69ff/attachment.asc>


More information about the ffmpeg-devel mailing list