[FFmpeg-devel] [PATCH] A rather simple H.264 speed optimization
Michael Niedermayer
michaelni
Mon Jul 28 20:33:56 CEST 2008
On Mon, Jul 28, 2008 at 09:44:06AM -0600, Jason Garrett-Glaser wrote:
> On Mon, Jul 28, 2008 at 9:19 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Sun, Jul 27, 2008 at 10:50:29PM -0600, Jason Garrett-Glaser wrote:
> >> $subject, gains 6 clock cycles or so per decode_cabac_residual call on
> >> an ordinary source.
> >
> > ok
>
> applied
>
> round 2:
>
> I used the extra casts in the p[0] p[8] etc cases for clarity, to
> avoid changing the array indices.
>
> Index: libavcodec/h264.c
> ===================================================================
> --- libavcodec/h264.c (revision 14459)
> +++ libavcodec/h264.c (working copy)
> @@ -354,19 +354,15 @@
> if(USES_LIST(top_type, list)){
> const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
> const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
> - *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
> *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
> - *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]=
> *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
> - *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
> *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
> - *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]=
> *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
> + *(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
> *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0];
> + *(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
> *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2];
This patch looks like it has been salted with \n ...
I doubt patch would accept that as input ;)
> h->ref_cache[list][scan8[0] + 0 - 1*8]=
> h->ref_cache[list][scan8[0] + 1 - 1*8]=
> s->current_picture.ref_index[list][b8_xy + 0];
> h->ref_cache[list][scan8[0] + 2 - 1*8]=
> h->ref_cache[list][scan8[0] + 3 - 1*8]=
> s->current_picture.ref_index[list][b8_xy + 1];
> }else{
> - *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
> - *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
> - *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
> - *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
> + *(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
> + *(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0;
> *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]=
> ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
> }
>
> @@ -428,15 +424,11 @@
> /* XXX beurk, Load mvd */
> if(USES_LIST(top_type, list)){
> const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
> - *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 -
> 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
> - *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 -
> 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
> - *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 -
> 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
> - *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 -
> 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
> + *(uint64_t*)h->mvd_cache[list][scan8[0] + 0 -
> 1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 0];
> + *(uint64_t*)h->mvd_cache[list][scan8[0] + 2 -
> 1*8]= *(uint64_t*)h->mvd_table[list][b_xy + 2];
How does this affect benchmark scores? [and on what cpu, 32/64?]
[...]
> @@ -5698,6 +5690,7 @@
> int mpx, mpy;
> int mx, my;
> const int index= 4*i + block_width*j;
> + uint32_t mv, mvd;
> int16_t (* mv_cache)[2]= &h->mv_cache[list][
> scan8[index] ];
> int16_t (* mvd_cache)[2]=
> &h->mvd_cache[list][ scan8[index] ];
> pred_motion(h, index, block_width, list,
> h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
> @@ -5706,40 +5699,26 @@
> my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
> tprintf(s->avctx, "final mv:%d %d\n", mx, my);
>
> + mv = pack16to32( mx, my );
> + mvd = pack16to32( mx - mpx, my - mpy );
> if(IS_SUB_8X8(sub_mb_type)){
> - mv_cache[ 1 ][0]=
> - mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
> - mv_cache[ 1 ][1]=
> - mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
> -
> - mvd_cache[ 1 ][0]=
> - mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
> - mvd_cache[ 1 ][1]=
> - mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
> + *(uint32_t*) mv_cache[1]=*(uint32_t*)
> mv_cache[8]=*(uint32_t*) mv_cache[9]= mv;
> +
> *(uint32_t*)mvd_cache[1]=*(uint32_t*)mvd_cache[8]=*(uint32_t*)mvd_cache[9]=
> mvd;
> }else if(IS_SUB_8X4(sub_mb_type)){
> - mv_cache[ 1 ][0]= mx;
> - mv_cache[ 1 ][1]= my;
> -
> - mvd_cache[ 1 ][0]= mx - mpx;
> - mvd_cache[ 1 ][1]= my - mpy;
> + *(uint32_t*) mv_cache[1]= mv;
> + *(uint32_t*)mvd_cache[1]= mvd;
> }else if(IS_SUB_4X8(sub_mb_type)){
> - mv_cache[ 8 ][0]= mx;
> - mv_cache[ 8 ][1]= my;
> -
> - mvd_cache[ 8 ][0]= mx - mpx;
> - mvd_cache[ 8 ][1]= my - mpy;
> + *(uint32_t*) mv_cache[8]= mv;
> + *(uint32_t*)mvd_cache[8]= mvd;
> }
> - mv_cache[ 0 ][0]= mx;
> - mv_cache[ 0 ][1]= my;
> -
> - mvd_cache[ 0 ][0]= mx - mpx;
> - mvd_cache[ 0 ][1]= my - mpy;
> + *(uint32_t*) mv_cache[0]= mv;
> + *(uint32_t*)mvd_cache[0]= mvd;
> }
This looks independant of the 32->64 change and should be in a seperate
patch. (its impossible to benchmark either correctly if they are combined)
> }else{
> - uint32_t *p= (uint32_t *)&h->mv_cache[list][
> scan8[4*i] ][0];
> - uint32_t *pd= (uint32_t *)&h->mvd_cache[list][
> scan8[4*i] ][0];
> + uint32_t *p = (uint32_t*)&h->mv_cache[list][
> scan8[4*i] ][0];
> + uint32_t *pd= (uint32_t*)&h->mvd_cache[list][
> scan8[4*i] ][0];
Did anything beside whitespace change in these?
[...]
> @@ -5908,7 +5887,7 @@
> }
> } else {
> uint8_t * const nnz= &h->non_zero_count_cache[
> scan8[4*i8x8] ];
> - nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
> + *(uint16_t*)&nnz[0] = *(uint16_t*)&nnz[8] = 0;
> }
> }
> }
> @@ -5933,14 +5912,14 @@
> }
> } else {
> uint8_t * const nnz= &h->non_zero_count_cache[0];
> - nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8
> ] =nnz[ scan8[16]+9 ] =
> - nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8
> ] =nnz[ scan8[20]+9 ] = 0;
> + *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
> + *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[
> scan8[20]+8 ] = 0;
> }
> } else {
> uint8_t * const nnz= &h->non_zero_count_cache[0];
> fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
> - nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ]
> =nnz[ scan8[16]+9 ] =
> - nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ]
> =nnz[ scan8[20]+9 ] = 0;
> + *(uint16_t*)&nnz[ scan8[16]+0 ] = *(uint16_t*)&nnz[ scan8[16]+8 ] =
> + *(uint16_t*)&nnz[ scan8[20]+0 ] = *(uint16_t*)&nnz[ scan8[20]+8 ] = 0;
> h->last_qscale_diff = 0;
> }
ok
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at mplayerhq.hu
> https://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-devel
>
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Republics decline into democracies and democracies degenerate into
despotisms. -- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080728/d64373a6/attachment.pgp>
More information about the ffmpeg-devel
mailing list