[FFmpeg-devel] [PATCH 2/5] avcodec/vc1: optimize block functions

zhaoxiu.zeng zhaoxiu.zeng at gmail.com
Tue Feb 24 17:28:45 CET 2015


在 2015/2/19 2:12, Michael Niedermayer 写道:
> On Sat, Feb 14, 2015 at 10:56:28PM +0800, zhaoxiu.zeng wrote:
>> From 960eca51e6e65e6969f7d829e29ddc2387420733 Mon Sep 17 00:00:00 2001
>> From: Zeng Zhaoxiu <zhaoxiu.zeng at gmail.com>
>> Date: Sat, 14 Feb 2015 19:46:51 +0800
>> Subject: [PATCH 2/5] avcodec/vc1: optimize block functions
> 
> This patch breaks make fate:
> also please include benchmark values
> 
> --- ./tests/ref/fate/mss2-wmv   2015-02-12 16:53:42.138771890 +0100
> +++ tests/data/fate/mss2-wmv    2015-02-18 19:10:41.449866314 +0100
> @@ -27,11 +27,11 @@
>  0,         31,         31,        1,   230400, 0x18a2f97a
>  0,         32,         32,        1,   230400, 0xf9e82961
>  0,         33,         33,        1,   230400, 0x57a8e9e8
> -0,         34,         34,        1,   230400, 0xdef6fd66
> -0,         35,         35,        1,   230400, 0xc7d923a9
> -0,         36,         36,        1,   230400, 0x08bb41ee
> -0,         37,         37,        1,   230400, 0x43ccbd29
> -0,         38,         38,        1,   230400, 0x46666ee3
> +0,         34,         34,        1,   230400, 0xbd60fd5a
> +0,         35,         35,        1,   230400, 0x289b2391
> +0,         36,         36,        1,   230400, 0x8d5a4205
> +0,         37,         37,        1,   230400, 0xbed1bcb6
> +0,         38,         38,        1,   230400, 0x24086ea2
>  0,         39,         39,        1,   230400, 0xbfd2ef29
>  0,         40,         40,        1,   230400, 0x6504545f
>  0,         41,         41,        1,   230400, 0x8fb86901
> @@ -40,25 +40,25 @@
>  0,         44,         44,        1,   230400, 0xf808106b
>  0,         45,         45,        1,   230400, 0x34150020
>  0,         46,         46,        1,   230400, 0x50fdfe89
> -0,         47,         47,        1,   230400, 0x920b7708
> +0,         47,         47,        1,   230400, 0xe8287631
>  0,         48,         48,        1,   230400, 0xed64fcc4
>  0,         49,         49,        1,   230400, 0x6291a170
>  0,         50,         50,        1,   230400, 0x20524643
> -0,         51,         51,        1,   230400, 0x92aafecd
> +0,         51,         51,        1,   230400, 0x5e9efe62
>  0,         52,         52,        1,   230400, 0xf00ee14d
>  0,         53,         53,        1,   230400, 0xfa3113ea
>  0,         54,         54,        1,   230400, 0x99c06df1
>  0,         55,         55,        1,   230400, 0x625c6918
> -0,         56,         56,        1,   230400, 0xb277b25e
> +0,         56,         56,        1,   230400, 0xeb34b22e
>  0,         57,         57,        1,   230400, 0x2e913006
>  0,         58,         58,        1,   230400, 0x3f6f1d99
>  0,         59,         59,        1,   230400, 0x100ab60f
> -0,         60,         60,        1,   230400, 0x9b73d0bf
> +0,         60,         60,        1,   230400, 0xe21acfc4
>  0,         61,         61,        1,   230400, 0xda0df2ce
>  0,         62,         62,        1,   230400, 0x67f7ca24
>  0,         63,         63,        1,   230400, 0xbde9b3d0
>  0,         64,         64,        1,   230400, 0x92e14d07
> -0,         65,         65,        1,   230400, 0x9426c3d9
> +0,         65,         65,        1,   230400, 0xa584c3c4
>  0,         66,         66,        1,   230400, 0x6104be70
>  0,         67,         67,        1,   230400, 0xc4d1078a
>  0,         68,         68,        1,   230400, 0x89426a42
> @@ -67,35 +67,35 @@
>  0,         71,         71,        1,   230400, 0x4249b8c6
>  0,         72,         72,        1,   230400, 0x4b88cad3
>  0,         73,         73,        1,   230400, 0x76af545d
> -0,         74,         74,        1,   230400, 0xfe47e3c4
> +0,         74,         74,        1,   230400, 0xb165e37d
>  0,         75,         75,        1,   230400, 0xa2e0e721
>  0,         76,         76,        1,   230400, 0xde974a42
> -0,         77,         77,        1,   230400, 0x87bf38ba
> +0,         77,         77,        1,   230400, 0x3dad37cc
>  0,         78,         78,        1,   230400, 0xd52318fd
>  0,         79,         79,        1,   230400, 0x0bbb1526
> -0,         80,         80,        1,   230400, 0xa22c5e5e
> +0,         80,         80,        1,   230400, 0xe85b5e88
>  0,         81,         81,        1,   230400, 0x4532c5d2
>  0,         82,         82,        1,   230400, 0x88b560ec
> -0,         83,         83,        1,   230400, 0xcee9d9c9
> +0,         83,         83,        1,   230400, 0xeddad96d
>  0,         84,         84,        1,   230400, 0x0429358f
> -0,         85,         85,        1,   230400, 0xf18a9b98
> -0,         86,         86,        1,   230400, 0x63f7a12c
> -0,         87,         87,        1,   230400, 0x98635515
> +0,         85,         85,        1,   230400, 0xaee09b6d
> +0,         86,         86,        1,   230400, 0xce98a02b
> +0,         87,         87,        1,   230400, 0x127654f4
>  0,         88,         88,        1,   230400, 0x36affebc
>  0,         89,         89,        1,   230400, 0xd8c19629
> -0,         90,         90,        1,   230400, 0x9ef5344d
> -0,         91,         91,        1,   230400, 0x545668dc
> -0,         92,         92,        1,   230400, 0x50e65e74
> -0,         93,         93,        1,   230400, 0xe3258be3
> -0,         94,         94,        1,   230400, 0xeb479e1b
> -0,         95,         95,        1,   230400, 0x91894243
> -0,         96,         96,        1,   230400, 0x3c5660fc
> -0,         97,         97,        1,   230400, 0xf0c35673
> +0,         90,         90,        1,   230400, 0xfaac34dd
> +0,         91,         91,        1,   230400, 0x552568d9
> +0,         92,         92,        1,   230400, 0xc0015fad
> +0,         93,         93,        1,   230400, 0x50778be0
> +0,         94,         94,        1,   230400, 0x5d569f88
> +0,         95,         95,        1,   230400, 0xe2c5424a
> +0,         96,         96,        1,   230400, 0x72d6631f
> +0,         97,         97,        1,   230400, 0x64e656b2
>  0,         98,         98,        1,   230400, 0x552832e8
>  0,         99,         99,        1,   230400, 0x1970f2b1
> -0,        100,        100,        1,   230400, 0x812d4c91
> +0,        100,        100,        1,   230400, 0x464549e2
>  0,        101,        101,        1,   230400, 0xa3fbd4ef
> -0,        102,        102,        1,   230400, 0x486f9649
> +0,        102,        102,        1,   230400, 0xc44493bc
>  0,        103,        103,        1,   230400, 0x850f315a
> -0,        104,        104,        1,   230400, 0xc18ec66b
> -0,        105,        105,        1,   230400, 0xc9ef266e
> +0,        104,        104,        1,   230400, 0xa3a4c41c
> +0,        105,        105,        1,   230400, 0x0f5523c7
> Test mss2-wmv failed. Look at tests/data/fate/mss2-wmv.err for details.
> make: *** [fate-mss2-wmv] Error 1
> make: *** Waiting for unfinished jobs....
> 
> [...]
> 
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
From f55454c496ff8d372528dcc61536f3a575de3c4b Mon Sep 17 00:00:00 2001
From: Zeng Zhaoxiu <zhaoxiu.zeng at gmail.com>
Date: Wed, 25 Feb 2015 00:04:20 +0800
Subject: [PATCH 1/1] avcodec/vc1: optimize block functions

I tested on Fedora 21 x86_64 (running under VMware Workstation on 32-bit Windows 7).
The following are the results:

ffplay fate-suite/vc1/SA10143.vc1

before:
109760 decicycles in vc1_decode_i_block_adv, 1 runs, 0 skips
83680 decicycles in vc1_decode_i_block_adv, 2 runs, 0 skips
54310 decicycles in vc1_decode_i_block_adv, 4 runs, 0 skips
44165 decicycles in vc1_decode_i_block_adv, 8 runs, 0 skips
30712 decicycles in vc1_decode_i_block_adv, 16 runs, 0 skips
22447 decicycles in vc1_decode_i_block_adv, 32 runs, 0 skips
14320 decicycles in vc1_decode_i_block_adv, 64 runs, 0 skips
9794 decicycles in vc1_decode_i_block_adv, 128 runs, 0 skips
6982 decicycles in vc1_decode_i_block_adv, 256 runs, 0 skips
6472 decicycles in vc1_decode_i_block_adv, 512 runs, 0 skips
7362 decicycles in vc1_decode_i_block_adv, 1024 runs, 0 skips
9127 decicycles in vc1_decode_i_block_adv, 2047 runs, 1 skips
63200 decicycles in vc1_decode_p_block, 1 runs, 0 skips
53260 decicycles in vc1_decode_p_block, 2 runs, 0 skips
35307480 decicycles in vc1_decode_p_mb_intfi, 1 runs, 0 skips
45460 decicycles in vc1_decode_p_block, 4 runs, 0 skips
18741780 decicycles in vc1_decode_p_mb_intfi, 2 runs, 0 skips
57960 decicycles in vc1_decode_intra_block, 1 runs, 0 skips
39140 decicycles in vc1_decode_intra_block, 2 runs, 0 skips
32020 decicycles in vc1_decode_intra_block, 4 runs, 0 skips
32145 decicycles in vc1_decode_p_block, 8 runs, 0 skips
11380690 decicycles in vc1_decode_p_mb_intfi, 4 runs, 0 skips
25795 decicycles in vc1_decode_intra_block, 8 runs, 0 skips
20387 decicycles in vc1_decode_intra_block, 16 runs, 0 skips
6241875 decicycles in vc1_decode_p_mb_intfi, 8 runs, 0 skips
21740 decicycles in vc1_decode_p_block, 16 runs, 0 skips
13972 decicycles in vc1_decode_intra_block, 32 runs, 0 skips
3401612 decicycles in vc1_decode_p_mb_intfi, 16 runs, 0 skips
9278 decicycles in vc1_decode_intra_block, 64 runs, 0 skips
1823050 decicycles in vc1_decode_p_mb_intfi, 32 runs, 0 skips
7602 decicycles in vc1_decode_intra_block, 128 runs, 0 skips
15457 decicycles in vc1_decode_p_block, 32 runs, 0 skips
990403 decicycles in vc1_decode_p_mb_intfi, 63 runs, 1 skips
11653 decicycles in vc1_decode_p_block, 64 runs, 0 skips
8031 decicycles in vc1_decode_intra_block, 256 runs, 0 skips
549345 decicycles in vc1_decode_p_mb_intfi, 127 runs, 1 skips
9518 decicycles in vc1_decode_p_block, 128 runs, 0 skips
10095 decicycles in vc1_decode_p_block, 256 runs, 0 skips
319886 decicycles in vc1_decode_p_mb_intfi, 255 runs, 1 skips
8981 decicycles in vc1_decode_intra_block, 512 runs, 0 skips
11607 decicycles in vc1_decode_p_block, 512 runs, 0 skips
13843 decicycles in vc1_decode_p_block, 1024 runs, 0 skips
222799 decicycles in vc1_decode_p_mb_intfi, 508 runs, 4 skips
13811 decicycles in vc1_decode_intra_block, 1024 runs, 0 skips
147749 decicycles in vc1_decode_p_mb_intfi, 1019 runs, 5 skips
13281 decicycles in vc1_decode_intra_block, 2047 runs, 1 skips
13434 decicycles in vc1_decode_p_block, 2048 runs, 0 skips
327920 decicycles in vc1_decode_b_mb_intfi, 1 runs, 0 skips
2382940 decicycles in vc1_decode_b_mb_intfi, 2 runs, 0 skips
1242270 decicycles in vc1_decode_b_mb_intfi, 4 runs, 0 skips
646465 decicycles in vc1_decode_b_mb_intfi, 8 runs, 0 skips
340417 decicycles in vc1_decode_b_mb_intfi, 16 runs, 0 skips
192212 decicycles in vc1_decode_b_mb_intfi, 32 runs, 0 skips
113155 decicycles in vc1_decode_b_mb_intfi, 64 runs, 0 skips
70909 decicycles in vc1_decode_b_mb_intfi, 128 runs, 0 skips
53167 decicycles in vc1_decode_b_mb_intfi, 255 runs, 1 skips
12262 decicycles in vc1_decode_p_block, 4096 runs, 0 skips
59743 decicycles in vc1_decode_b_mb_intfi, 509 runs, 3 skips
50252 decicycles in vc1_decode_b_mb_intfi, 1021 runs, 3 skips
12524 decicycles in vc1_decode_intra_block, 4094 runs, 2 skips
113632 decicycles in vc1_decode_p_mb_intfi, 2040 runs, 8 skips
11518 decicycles in vc1_decode_p_block, 8191 runs, 1 skips
51172 decicycles in vc1_decode_b_mb_intfi, 2045 runs, 3 skips
93654 decicycles in vc1_decode_p_mb_intfi, 4079 runs, 17 skips
11403 decicycles in vc1_decode_intra_block, 8188 runs, 4 skips
10747 decicycles in vc1_decode_p_block, 16381 runs, 3 skips
47553 decicycles in vc1_decode_b_mb_intfi, 4089 runs, 7 skips
10803 decicycles in vc1_decode_intra_block, 16374 runs, 10 skips
81273 decicycles in vc1_decode_p_mb_intfi, 8163 runs, 29 skips
44723 decicycles in vc1_decode_b_mb_intfi, 8175 runs, 17 skips
10169 decicycles in vc1_decode_p_block, 32755 runs, 13 skips
9893 decicycles in vc1_decode_intra_block, 32749 runs, 19 skips
72528 decicycles in vc1_decode_p_mb_intfi, 16338 runs, 46 skips
39796 decicycles in vc1_decode_b_mb_intfi, 16351 runs, 33 skips

after:
104200 decicycles in vc1_decode_i_block_adv, 1 runs, 0 skips
84300 decicycles in vc1_decode_i_block_adv, 2 runs, 0 skips
51940 decicycles in vc1_decode_i_block_adv, 4 runs, 0 skips
42210 decicycles in vc1_decode_i_block_adv, 8 runs, 0 skips
29867 decicycles in vc1_decode_i_block_adv, 16 runs, 0 skips
21970 decicycles in vc1_decode_i_block_adv, 32 runs, 0 skips
14093 decicycles in vc1_decode_i_block_adv, 64 runs, 0 skips
9542 decicycles in vc1_decode_i_block_adv, 128 runs, 0 skips
6809 decicycles in vc1_decode_i_block_adv, 256 runs, 0 skips
6349 decicycles in vc1_decode_i_block_adv, 512 runs, 0 skips
7303 decicycles in vc1_decode_i_block_adv, 1024 runs, 0 skips
8936 decicycles in vc1_decode_i_block_adv, 2048 runs, 0 skips
57120 decicycles in vc1_decode_p_block, 1 runs, 0 skips
52000 decicycles in vc1_decode_p_block, 2 runs, 0 skips
27188760 decicycles in vc1_decode_p_mb_intfi, 1 runs, 0 skips
47030 decicycles in vc1_decode_p_block, 4 runs, 0 skips
14698180 decicycles in vc1_decode_p_mb_intfi, 2 runs, 0 skips
46520 decicycles in vc1_decode_intra_block, 1 runs, 0 skips
32760 decicycles in vc1_decode_intra_block, 2 runs, 0 skips
29860 decicycles in vc1_decode_intra_block, 4 runs, 0 skips
34490 decicycles in vc1_decode_p_block, 8 runs, 0 skips
9363940 decicycles in vc1_decode_p_mb_intfi, 4 runs, 0 skips
25265 decicycles in vc1_decode_intra_block, 8 runs, 0 skips
19467 decicycles in vc1_decode_intra_block, 16 runs, 0 skips
5225080 decicycles in vc1_decode_p_mb_intfi, 8 runs, 0 skips
23847 decicycles in vc1_decode_p_block, 16 runs, 0 skips
13388 decicycles in vc1_decode_intra_block, 32 runs, 0 skips
2888357 decicycles in vc1_decode_p_mb_intfi, 16 runs, 0 skips
8846 decicycles in vc1_decode_intra_block, 64 runs, 0 skips
1530398 decicycles in vc1_decode_p_mb_intfi, 32 runs, 0 skips
7410 decicycles in vc1_decode_intra_block, 128 runs, 0 skips
15976 decicycles in vc1_decode_p_block, 32 runs, 0 skips
857584 decicycles in vc1_decode_p_mb_intfi, 64 runs, 0 skips
11717 decicycles in vc1_decode_p_block, 64 runs, 0 skips
7667 decicycles in vc1_decode_intra_block, 256 runs, 0 skips
485425 decicycles in vc1_decode_p_mb_intfi, 128 runs, 0 skips
9558 decicycles in vc1_decode_p_block, 128 runs, 0 skips
9939 decicycles in vc1_decode_p_block, 256 runs, 0 skips
287999 decicycles in vc1_decode_p_mb_intfi, 256 runs, 0 skips
8808 decicycles in vc1_decode_intra_block, 512 runs, 0 skips
11288 decicycles in vc1_decode_p_block, 512 runs, 0 skips
13544 decicycles in vc1_decode_p_block, 1024 runs, 0 skips
208198 decicycles in vc1_decode_p_mb_intfi, 510 runs, 2 skips
13573 decicycles in vc1_decode_intra_block, 1024 runs, 0 skips
138593 decicycles in vc1_decode_p_mb_intfi, 1021 runs, 3 skips
13239 decicycles in vc1_decode_intra_block, 2048 runs, 0 skips
13184 decicycles in vc1_decode_p_block, 2048 runs, 0 skips
337160 decicycles in vc1_decode_b_mb_intfi, 1 runs, 0 skips
309520 decicycles in vc1_decode_b_mb_intfi, 2 runs, 0 skips
200250 decicycles in vc1_decode_b_mb_intfi, 4 runs, 0 skips
126440 decicycles in vc1_decode_b_mb_intfi, 8 runs, 0 skips
80085 decicycles in vc1_decode_b_mb_intfi, 16 runs, 0 skips
63417 decicycles in vc1_decode_b_mb_intfi, 32 runs, 0 skips
48461 decicycles in vc1_decode_b_mb_intfi, 64 runs, 0 skips
37764 decicycles in vc1_decode_b_mb_intfi, 128 runs, 0 skips
36558 decicycles in vc1_decode_b_mb_intfi, 255 runs, 1 skips
12089 decicycles in vc1_decode_p_block, 4095 runs, 1 skips
50633 decicycles in vc1_decode_b_mb_intfi, 510 runs, 2 skips
44833 decicycles in vc1_decode_b_mb_intfi, 1022 runs, 2 skips
12613 decicycles in vc1_decode_intra_block, 4095 runs, 1 skips
109006 decicycles in vc1_decode_p_mb_intfi, 2042 runs, 6 skips
11326 decicycles in vc1_decode_p_block, 8191 runs, 1 skips
47920 decicycles in vc1_decode_b_mb_intfi, 2046 runs, 2 skips
90381 decicycles in vc1_decode_p_mb_intfi, 4090 runs, 6 skips
11445 decicycles in vc1_decode_intra_block, 8191 runs, 1 skips
10592 decicycles in vc1_decode_p_block, 16383 runs, 1 skips
45227 decicycles in vc1_decode_b_mb_intfi, 4094 runs, 2 skips
10848 decicycles in vc1_decode_intra_block, 16380 runs, 4 skips
79049 decicycles in vc1_decode_p_mb_intfi, 8183 runs, 9 skips
42179 decicycles in vc1_decode_b_mb_intfi, 8190 runs, 2 skips
10011 decicycles in vc1_decode_p_block, 32765 runs, 3 skips
9995 decicycles in vc1_decode_intra_block, 32760 runs, 8 skips
71169 decicycles in vc1_decode_p_mb_intfi, 16369 runs, 15 skips
38272 decicycles in vc1_decode_b_mb_intfi, 16372 runs, 12 skips

ffplay fate-suite/vc1/SA20021.vc1

before:
120560 decicycles in vc1_decode_i_block_adv, 1 runs, 0 skips
142580 decicycles in vc1_decode_i_block_adv, 2 runs, 0 skips
101460 decicycles in vc1_decode_i_block_adv, 4 runs, 0 skips
104735 decicycles in vc1_decode_i_block_adv, 8 runs, 0 skips
75700 decicycles in vc1_decode_i_block_adv, 16 runs, 0 skips
51305 decicycles in vc1_decode_i_block_adv, 32 runs, 0 skips
34807 decicycles in vc1_decode_i_block_adv, 64 runs, 0 skips
26759 decicycles in vc1_decode_i_block_adv, 128 runs, 0 skips
21359 decicycles in vc1_decode_i_block_adv, 255 runs, 1 skips
18802 decicycles in vc1_decode_i_block_adv, 511 runs, 1 skips
18105 decicycles in vc1_decode_i_block_adv, 1015 runs, 9 skips
16806 decicycles in vc1_decode_i_block_adv, 2038 runs, 10 skips
13855 decicycles in vc1_decode_i_block_adv, 4085 runs, 11 skips
74000 decicycles in vc1_decode_p_block, 1 runs, 0 skips
56900 decicycles in vc1_decode_p_block, 2 runs, 0 skips
52840 decicycles in vc1_decode_p_block, 4 runs, 0 skips
2556880 decicycles in vc1_decode_p_mb, 1 runs, 0 skips
94680 decicycles in vc1_decode_intra_block, 1 runs, 0 skips
66640 decicycles in vc1_decode_intra_block, 2 runs, 0 skips
46950 decicycles in vc1_decode_intra_block, 4 runs, 0 skips
1733940 decicycles in vc1_decode_p_mb, 2 runs, 0 skips
33230 decicycles in vc1_decode_p_block, 8 runs, 0 skips
33640 decicycles in vc1_decode_intra_block, 8 runs, 0 skips
1047390 decicycles in vc1_decode_p_mb, 4 runs, 0 skips
21462 decicycles in vc1_decode_p_block, 16 runs, 0 skips
24275 decicycles in vc1_decode_intra_block, 16 runs, 0 skips
636020 decicycles in vc1_decode_p_mb, 8 runs, 0 skips
17845 decicycles in vc1_decode_p_block, 32 runs, 0 skips
19492 decicycles in vc1_decode_intra_block, 32 runs, 0 skips
405710 decicycles in vc1_decode_p_mb, 16 runs, 0 skips
18106 decicycles in vc1_decode_intra_block, 64 runs, 0 skips
14416 decicycles in vc1_decode_p_block, 64 runs, 0 skips
287747 decicycles in vc1_decode_p_mb, 32 runs, 0 skips
15117 decicycles in vc1_decode_intra_block, 128 runs, 0 skips
12311 decicycles in vc1_decode_p_block, 128 runs, 0 skips
199671 decicycles in vc1_decode_p_mb, 63 runs, 1 skips
12412 decicycles in vc1_decode_p_block, 256 runs, 0 skips
16608 decicycles in vc1_decode_intra_block, 256 runs, 0 skips
171804 decicycles in vc1_decode_p_mb, 125 runs, 3 skips
13926 decicycles in vc1_decode_p_block, 512 runs, 0 skips
18409 decicycles in vc1_decode_intra_block, 512 runs, 0 skips
152801 decicycles in vc1_decode_p_mb, 251 runs, 5 skips
11951 decicycles in vc1_decode_p_block, 1024 runs, 0 skips
15527 decicycles in vc1_decode_intra_block, 1024 runs, 0 skips
118882 decicycles in vc1_decode_p_mb, 505 runs, 7 skips
12140 decicycles in vc1_decode_p_block, 2048 runs, 0 skips
16176 decicycles in vc1_decode_intra_block, 2046 runs, 2 skips
130388 decicycles in vc1_decode_p_mb, 1012 runs, 12 skips
14547 decicycles in vc1_decode_p_block, 4093 runs, 3 skips
15390 decicycles in vc1_decode_intra_block, 4091 runs, 5 skips
116925 decicycles in vc1_decode_p_mb, 2029 runs, 19 skips
13081 decicycles in vc1_decode_intra_block, 8183 runs, 9 skips
98449 decicycles in vc1_decode_p_mb, 4067 runs, 29 skips
12712 decicycles in vc1_decode_p_block, 8183 runs, 9 skips
10512 decicycles in vc1_decode_intra_block, 16370 runs, 14 skips
80419 decicycles in vc1_decode_p_mb, 8146 runs, 46 skips
11303 decicycles in vc1_decode_p_block, 16369 runs, 15 skips
8666 decicycles in vc1_decode_intra_block, 32740 runs, 28 skips
68238 decicycles in vc1_decode_p_mb, 16313 runs, 71 skips
10360 decicycles in vc1_decode_p_block, 32743 runs, 25 skips
8093 decicycles in vc1_decode_intra_block, 65467 runs, 69 skips
66530 decicycles in vc1_decode_p_mb, 32621 runs, 147 skips

after:
92560 decicycles in vc1_decode_i_block_adv, 1 runs, 0 skips
91300 decicycles in vc1_decode_i_block_adv, 2 runs, 0 skips
67220 decicycles in vc1_decode_i_block_adv, 4 runs, 0 skips
49370 decicycles in vc1_decode_i_block_adv, 8 runs, 0 skips
33667 decicycles in vc1_decode_i_block_adv, 16 runs, 0 skips
23667 decicycles in vc1_decode_i_block_adv, 32 runs, 0 skips
18514 decicycles in vc1_decode_i_block_adv, 64 runs, 0 skips
16161 decicycles in vc1_decode_i_block_adv, 128 runs, 0 skips
13475 decicycles in vc1_decode_i_block_adv, 256 runs, 0 skips
12499 decicycles in vc1_decode_i_block_adv, 512 runs, 0 skips
11917 decicycles in vc1_decode_i_block_adv, 1023 runs, 1 skips
11256 decicycles in vc1_decode_i_block_adv, 2046 runs, 2 skips
10976 decicycles in vc1_decode_i_block_adv, 4092 runs, 4 skips
90360 decicycles in vc1_decode_p_block, 1 runs, 0 skips
72720 decicycles in vc1_decode_p_block, 2 runs, 0 skips
60630 decicycles in vc1_decode_p_block, 4 runs, 0 skips
9561080 decicycles in vc1_decode_p_mb, 1 runs, 0 skips
134360 decicycles in vc1_decode_intra_block, 1 runs, 0 skips
91580 decicycles in vc1_decode_intra_block, 2 runs, 0 skips
60960 decicycles in vc1_decode_intra_block, 4 runs, 0 skips
8349620 decicycles in vc1_decode_p_mb, 2 runs, 0 skips
40990 decicycles in vc1_decode_p_block, 8 runs, 0 skips
42555 decicycles in vc1_decode_intra_block, 8 runs, 0 skips
5928030 decicycles in vc1_decode_p_mb, 4 runs, 0 skips
26845 decicycles in vc1_decode_p_block, 16 runs, 0 skips
30737 decicycles in vc1_decode_intra_block, 16 runs, 0 skips
3525450 decicycles in vc1_decode_p_mb, 8 runs, 0 skips
21793 decicycles in vc1_decode_p_block, 32 runs, 0 skips
23512 decicycles in vc1_decode_intra_block, 32 runs, 0 skips
2103475 decicycles in vc1_decode_p_mb, 16 runs, 0 skips
20467 decicycles in vc1_decode_intra_block, 64 runs, 0 skips
16995 decicycles in vc1_decode_p_block, 64 runs, 0 skips
1230400 decicycles in vc1_decode_p_mb, 32 runs, 0 skips
16564 decicycles in vc1_decode_intra_block, 128 runs, 0 skips
13764 decicycles in vc1_decode_p_block, 128 runs, 0 skips
750279 decicycles in vc1_decode_p_mb, 64 runs, 0 skips
11652 decicycles in vc1_decode_p_block, 256 runs, 0 skips
15900 decicycles in vc1_decode_intra_block, 256 runs, 0 skips
456913 decicycles in vc1_decode_p_mb, 128 runs, 0 skips
10833 decicycles in vc1_decode_p_block, 512 runs, 0 skips
14797 decicycles in vc1_decode_intra_block, 512 runs, 0 skips
291080 decicycles in vc1_decode_p_mb, 256 runs, 0 skips
10124 decicycles in vc1_decode_p_block, 1024 runs, 0 skips
13775 decicycles in vc1_decode_intra_block, 1023 runs, 1 skips
190311 decicycles in vc1_decode_p_mb, 510 runs, 2 skips
10360 decicycles in vc1_decode_p_block, 2046 runs, 2 skips
13429 decicycles in vc1_decode_intra_block, 2046 runs, 2 skips
144685 decicycles in vc1_decode_p_mb, 1020 runs, 4 skips
10555 decicycles in vc1_decode_p_block, 4092 runs, 4 skips
11877 decicycles in vc1_decode_intra_block, 4089 runs, 7 skips
105510 decicycles in vc1_decode_p_mb, 2036 runs, 12 skips
9920 decicycles in vc1_decode_intra_block, 8181 runs, 11 skips
78771 decicycles in vc1_decode_p_mb, 4075 runs, 21 skips
9712 decicycles in vc1_decode_p_block, 8185 runs, 7 skips
9013 decicycles in vc1_decode_intra_block, 16368 runs, 16 skips
70765 decicycles in vc1_decode_p_mb, 8154 runs, 38 skips
9780 decicycles in vc1_decode_p_block, 16367 runs, 17 skips
7972 decicycles in vc1_decode_intra_block, 32741 runs, 27 skips
63213 decicycles in vc1_decode_p_mb, 16319 runs, 65 skips
9337 decicycles in vc1_decode_p_block, 32738 runs, 30 skips
7542 decicycles in vc1_decode_intra_block, 65486 runs, 50 skips
60802 decicycles in vc1_decode_p_mb, 32639 runs, 129 skips

Signed-off-by: Zeng Zhaoxiu <zhaoxiu.zeng at gmail.com>
---
 libavcodec/vc1_block.c | 1689 ++++++++++++++++++++++++------------------------
 1 file changed, 846 insertions(+), 843 deletions(-)

diff --git a/libavcodec/vc1_block.c b/libavcodec/vc1_block.c
index aa62ec2..1c0141e 100644
--- a/libavcodec/vc1_block.c
+++ b/libavcodec/vc1_block.c
@@ -40,8 +40,10 @@
 #define DC_VLC_BITS 9
 
 // offset tables for interlaced picture MVDATA decoding
-static const int offset_table1[9] = {  0,  1,  2,  4,  8, 16, 32,  64, 128 };
-static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
+static const uint8_t offset_table[2][9] = {
+    {  0,  1,  2,  4,  8, 16, 32,  64, 128 },
+    {  0,  1,  3,  7, 15, 31, 63, 127, 255 },
+};
 
 /***********************************************************************/
 /**
@@ -51,7 +53,7 @@ static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
  */
 
 
-static void init_block_index(VC1Context *v)
+static inline void init_block_index(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     ff_init_block_index(s);
@@ -64,12 +66,9 @@ static void init_block_index(VC1Context *v)
 
 /** @} */ //Bitplane group
 
-static void vc1_put_signed_blocks_clamped(VC1Context *v)
+static void vc1_put_signed_blocks_clamped(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
-    int topleft_mb_pos, top_mb_pos;
-    int stride_y, fieldtx = 0;
-    int v_dist;
 
     /* The put pixels loop is always one MB row behind the decoding loop,
      * because we can only put pixels when overlap filtering is done, and
@@ -79,12 +78,16 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
      * decoding loop. The reason for this is again, because for filtering
      * of the right MB edge, we need the next MB present. */
     if (!s->first_slice_line) {
+        int stride_y, fieldtx = 0;
+        int v_dist;
+
         if (s->mb_x) {
-            topleft_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x - 1;
-            if (v->fcm == ILACE_FRAME)
+            if (v->fcm == ILACE_FRAME) {
+                int topleft_mb_pos = mb_pos - s->mb_stride - 1;
                 fieldtx = v->fieldtx_plane[topleft_mb_pos];
+            }
             stride_y       = s->linesize << fieldtx;
-            v_dist         = (16 - fieldtx) >> (fieldtx == 0);
+            v_dist         = fieldtx ? 15 : 8;
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
                                               s->dest[0] - 16 * s->linesize - 16,
                                               stride_y);
@@ -105,9 +108,10 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
                                               s->uvlinesize);
         }
         if (s->mb_x == s->mb_width - 1) {
-            top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x;
-            if (v->fcm == ILACE_FRAME)
+            if (v->fcm == ILACE_FRAME) {
+                int top_mb_pos = mb_pos - s->mb_stride;
                 fieldtx = v->fieldtx_plane[top_mb_pos];
+            }
             stride_y   = s->linesize << fieldtx;
             v_dist     = fieldtx ? 15 : 8;
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
@@ -156,7 +160,6 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
  */
 #define GET_MQUANT()                                           \
     if (v->dquantfrm) {                                        \
-        int edges = 0;                                         \
         if (v->dqprofile == DQPROFILE_ALL_MBS) {               \
             if (v->dqbilevel) {                                \
                 mquant = (get_bits1(gb)) ? v->altpq : v->pq;   \
@@ -167,21 +170,20 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
                 else                                           \
                     mquant = get_bits(gb, 5);                  \
             }                                                  \
+        } else {                                               \
+            int edges;                                         \
+            if (v->dqprofile == DQPROFILE_SINGLE_EDGE)         \
+                edges = 1 << v->dqsbedge;                      \
+            else if (v->dqprofile == DQPROFILE_DOUBLE_EDGES)   \
+                edges = (3 << v->dqsbedge) % 15;               \
+            else /*if (v->dqprofile == DQPROFILE_FOUR_EDGES)*/ \
+                edges = 15;                                    \
+            if (((edges & 1) && !s->mb_x) ||                   \
+                ((edges & 2) && s->first_slice_line) ||        \
+                ((edges & 4) && s->mb_x == (s->mb_width - 1)) || \
+                ((edges & 8) && s->mb_y == (s->mb_height - 1)))  \
+                mquant = v->altpq;                             \
         }                                                      \
-        if (v->dqprofile == DQPROFILE_SINGLE_EDGE)             \
-            edges = 1 << v->dqsbedge;                          \
-        else if (v->dqprofile == DQPROFILE_DOUBLE_EDGES)       \
-            edges = (3 << v->dqsbedge) % 15;                   \
-        else if (v->dqprofile == DQPROFILE_FOUR_EDGES)         \
-            edges = 15;                                        \
-        if ((edges&1) && !s->mb_x)                             \
-            mquant = v->altpq;                                 \
-        if ((edges&2) && s->first_slice_line)                  \
-            mquant = v->altpq;                                 \
-        if ((edges&4) && s->mb_x == (s->mb_width - 1))         \
-            mquant = v->altpq;                                 \
-        if ((edges&8) && s->mb_y == (s->mb_height - 1))        \
-            mquant = v->altpq;                                 \
         if (!mquant || mquant > 31) {                          \
             av_log(v->s.avctx, AV_LOG_ERROR,                   \
                    "Overriding invalid mquant %d\n", mquant);  \
@@ -199,11 +201,9 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
 #define GET_MVDATA(_dmv_x, _dmv_y)                                      \
     index = 1 + get_vlc2(gb, ff_vc1_mv_diff_vlc[s->mv_table_index].table, \
                          VC1_MV_DIFF_VLC_BITS, 2);                      \
-    if (index > 36) {                                                   \
-        mb_has_coeffs = 1;                                              \
+    mb_has_coeffs = (index >= 37);                                      \
+    if (mb_has_coeffs)                                                  \
         index -= 37;                                                    \
-    } else                                                              \
-        mb_has_coeffs = 0;                                              \
     s->mb_intra = 0;                                                    \
     if (!index) {                                                       \
         _dmv_x = _dmv_y = 0;                                            \
@@ -216,33 +216,32 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         s->mb_intra = 1;                                                \
     } else {                                                            \
         index1 = index % 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val&1);                                             \
-        _dmv_x = (sign ^ ((val>>1) + offset_table[index1])) - sign;     \
+        _dmv_x = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_x = (sign ^ ((val >> 1) + _dmv_x)) - sign;             \
+        }                                                               \
                                                                         \
         index1 = index / 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val & 1);                                           \
-        _dmv_y = (sign ^ ((val >> 1) + offset_table[index1])) - sign;   \
+        _dmv_y = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_y = (sign ^ ((val >> 1) + _dmv_y)) - sign;             \
+        }                                                               \
     }
 
 static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
                                                    int *dmv_y, int *pred_flag)
 {
     int index, index1;
-    int extend_x = 0, extend_y = 0;
+    int extend_x, extend_y;
     GetBitContext *gb = &v->s.gb;
     int bits, esc;
     int val, sign;
-    const int* offs_tab;
 
     if (v->numref) {
         bits = VC1_2REF_MVDATA_VLC_BITS;
@@ -251,52 +250,32 @@ static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
         bits = VC1_1REF_MVDATA_VLC_BITS;
         esc  = 71;
     }
-    switch (v->dmvrange) {
-    case 1:
-        extend_x = 1;
-        break;
-    case 2:
-        extend_y = 1;
-        break;
-    case 3:
-        extend_x = extend_y = 1;
-        break;
-    }
+    extend_x = v->dmvrange & 1;
+    extend_y = (v->dmvrange >> 1) & 1;
     index = get_vlc2(gb, v->imv_vlc->table, bits, 3);
     if (index == esc) {
         *dmv_x = get_bits(gb, v->k_x);
         *dmv_y = get_bits(gb, v->k_y);
         if (v->numref) {
-            if (pred_flag) {
+            if (pred_flag)
                 *pred_flag = *dmv_y & 1;
-                *dmv_y     = (*dmv_y + *pred_flag) >> 1;
-            } else {
-                *dmv_y     = (*dmv_y + (*dmv_y & 1)) >> 1;
-            }
+            *dmv_y = (*dmv_y + (*dmv_y & 1)) >> 1;
         }
     }
     else {
         av_assert0(index < esc);
-        if (extend_x)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
         index1 = (index + 1) % 9;
         if (index1 != 0) {
             val    = get_bits(gb, index1 + extend_x);
-            sign   = 0 -(val & 1);
-            *dmv_x = (sign ^ ((val >> 1) + offs_tab[index1])) - sign;
+            sign   = 0 - (val & 1);
+            *dmv_x = (sign ^ ((val >> 1) + offset_table[extend_x][index1])) - sign;
         } else
             *dmv_x = 0;
-        if (extend_y)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
         index1 = (index + 1) / 9;
         if (index1 > v->numref) {
-            val    = get_bits(gb, (index1 + (extend_y << v->numref)) >> v->numref);
+            val    = get_bits(gb, (index1 >> v->numref) + extend_y);
             sign   = 0 - (val & 1);
-            *dmv_y = (sign ^ ((val >> 1) + offs_tab[index1 >> v->numref])) - sign;
+            *dmv_y = (sign ^ ((val >> 1) + offset_table[extend_y][index1 >> v->numref])) - sign;
         } else
             *dmv_y = 0;
         if (v->numref && pred_flag)
@@ -345,8 +324,10 @@ static inline int vc1_i_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     };
 
     /* find prediction - wmv3_dc_scale always used here in fact */
-    if (n < 4) scale = s->y_dc_scale;
-    else       scale = s->c_dc_scale;
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
 
     wrap   = s->block_wrap[n];
     dc_val = s->dc_val[0] + s->block_index[n];
@@ -399,14 +380,19 @@ static inline int vc1_i_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
  */
 static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
                               int a_avail, int c_avail,
-                              int16_t **dc_val_ptr, int *dir_ptr)
+                              int16_t **dc_val_ptr, int *dir_ptr, int mb_pos)
 {
     int a, b, c, wrap, pred;
     int16_t *dc_val;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
-    int q1, q2 = 0;
+    int q1, q2;
     int dqscale_index;
 
+    /* scale predictors if needed */
+    q1 = s->current_picture.qscale_table[mb_pos];
+    dqscale_index = s->y_dc_scale_table[q1] - 1;
+    if (dqscale_index < 0)
+        return 0;
+
     wrap = s->block_wrap[n];
     dc_val = s->dc_val[0] + s->block_index[n];
 
@@ -416,11 +402,7 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     c = dc_val[ - 1];
     b = dc_val[ - 1 - wrap];
     a = dc_val[ - wrap];
-    /* scale predictors if needed */
-    q1 = s->current_picture.qscale_table[mb_pos];
-    dqscale_index = s->y_dc_scale_table[q1] - 1;
-    if (dqscale_index < 0)
-        return 0;
+
     if (c_avail && (n != 1 && n != 3)) {
         q2 = s->current_picture.qscale_table[mb_pos - 1];
         if (q2 && q2 != q1)
@@ -442,20 +424,12 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
             b = (b * s->y_dc_scale_table[q2] * ff_vc1_dqscale[dqscale_index] + 0x20000) >> 18;
     }
 
-    if (a_avail && c_avail) {
-        if (abs(a - b) <= abs(b - c)) {
-            pred     = c;
-            *dir_ptr = 1; // left
-        } else {
-            pred     = a;
-            *dir_ptr = 0; // top
-        }
+    if (c_avail && (!a_avail || abs(a - b) <= abs(b - c))) {
+        pred     = c;
+        *dir_ptr = 1; // left
     } else if (a_avail) {
         pred     = a;
         *dir_ptr = 0; // top
-    } else if (c_avail) {
-        pred     = c;
-        *dir_ptr = 1; // left
     } else {
         pred     = 0;
         *dir_ptr = 1; // left
@@ -514,17 +488,16 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                                 int *value, int codingset)
 {
     GetBitContext *gb = &v->s.gb;
-    int index, escape, run = 0, level = 0, lst = 0;
+    int index, run, level, lst, sign;
 
     index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
     if (index != ff_vc1_ac_sizes[codingset] - 1) {
         run   = vc1_index_decode_table[codingset][index][0];
         level = vc1_index_decode_table[codingset][index][1];
         lst   = index >= vc1_last_decode_table[codingset] || get_bits_left(gb) < 0;
-        if (get_bits1(gb))
-            level = -level;
+        sign  = get_bits1(gb);
     } else {
-        escape = decode210(gb);
+        int escape = decode210(gb);
         if (escape != 2) {
             index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
             run   = vc1_index_decode_table[codingset][index][0];
@@ -541,10 +514,8 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                 else
                     run += vc1_delta_run_table[codingset][level] + 1;
             }
-            if (get_bits1(gb))
-                level = -level;
+            sign = get_bits1(gb);
         } else {
-            int sign;
             lst = get_bits1(gb);
             if (v->s.esc3_level_length == 0) {
                 if (v->pq < 8 || v->dquantfrm) { // table 59
@@ -559,14 +530,12 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
             run   = get_bits(gb, v->s.esc3_run_length);
             sign  = get_bits1(gb);
             level = get_bits(gb, v->s.esc3_level_length);
-            if (sign)
-                level = -level;
         }
     }
 
     *last  = lst;
     *skip  = run;
-    *value = level;
+    *value = (level ^ -sign) + sign;
 }
 
 /** Decode intra block in intra frames - should be faster than decode_intra_block
@@ -585,29 +554,24 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     int i;
     int16_t *dc_val;
     int16_t *ac_val, *ac_val2;
-    int dcdiff;
+    int dcdiff, scale;
 
     /* Get DC differential */
-    if (n < 4) {
+    if (n < 4)
         dcdiff = get_vlc2(&s->gb, ff_msmp4_dc_luma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
-    } else {
+    else
         dcdiff = get_vlc2(&s->gb, ff_msmp4_dc_chroma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
-    }
     if (dcdiff < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Illegal DC VLC\n");
         return -1;
     }
     if (dcdiff) {
+        const int m = (v->pq == 1 || v->pq == 2) ? 3 - v->pq : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (v->pq == 1)      dcdiff = get_bits(gb, 10);
-            else if (v->pq == 2) dcdiff = get_bits(gb, 9);
-            else                 dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (v->pq == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (v->pq == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -618,27 +582,29 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-    /* Skip ? */
-    if (!coded) {
-        goto not_coded;
-    }
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
-    // AC Decoding
-    i = 1;
+    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val2 = ac_val;
+    if (dc_pred_dir) // left
+        ac_val -= 16;
+    else // top
+        ac_val -= 16 * s->block_wrap[n];
+
+    scale = v->pq * 2 + v->halfpq;
+
+    //AC Decoding
+    i = !!coded;
 
-    {
+    if (coded) {
         int last = 0, skip, value;
         const uint8_t *zz_table;
-        int scale;
         int k;
 
-        scale = v->pq * 2 + v->halfpq;
-
         if (v->s.ac_pred) {
             if (!dc_pred_dir)
                 zz_table = v->zz_8x8[2];
@@ -647,13 +613,6 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         } else
             zz_table = v->zz_8x8[1];
 
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
-        if (dc_pred_dir) // left
-            ac_val -= 16;
-        else // top
-            ac_val -= 16 * s->block_wrap[n];
-
         while (!last) {
             vc1_decode_ac_coeff(v, &last, &skip, &value, codingset);
             i += skip;
@@ -664,13 +623,15 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++)
-                    block[k << v->left_blk_sh] += ac_val[k];
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++)
-                    block[k << v->top_blk_sh] += ac_val[k + 8];
+                sh = v->top_blk_sh;
+                ac_val += 8;
             }
+            for (k = 1; k < 8; k++)
+                block[k << sh] += ac_val[k];
         }
         /* save AC coeffs for further prediction */
         for (k = 1; k < 8; k++) {
@@ -686,46 +647,30 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -v->pq : v->pq;
             }
 
-        if (s->ac_pred) i = 63;
-    }
-
-not_coded:
-    if (!coded) {
-        int k, scale;
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
+    } else {
+        int k;
 
-        i = 0;
-        scale = v->pq * 2 + v->halfpq;
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            ac_val -= 16;
-            if (s->ac_pred)
-                memcpy(ac_val2, ac_val, 8 * 2);
-        } else { // top
-            ac_val -= 16 * s->block_wrap[n];
-            if (s->ac_pred)
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-        }
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { //left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -v->pq : v->pq;
             }
-            i = 63;
         }
     }
+    if (s->ac_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -740,7 +685,7 @@ not_coded:
  * @param mquant quantizer value for this macroblock
  */
 static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
-                                  int coded, int codingset, int mquant)
+                                  int coded, int codingset, int mquant, int mb_pos)
 {
     GetBitContext *gb = &v->s.gb;
     MpegEncContext *s = &v->s;
@@ -753,72 +698,69 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     int use_pred = s->ac_pred;
     int scale;
     int q1, q2 = 0;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
     /* Get DC differential */
-    if (n < 4) {
+    if (n < 4)
         dcdiff = get_vlc2(&s->gb, ff_msmp4_dc_luma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
-    } else {
+    else
         dcdiff = get_vlc2(&s->gb, ff_msmp4_dc_chroma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
-    }
     if (dcdiff < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Illegal DC VLC\n");
         return -1;
     }
     if (dcdiff) {
+        const int m = (mquant == 1 || mquant == 2) ? 3 - mquant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
     }
 
     /* Prediction */
-    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, mquant, n, v->a_avail, v->c_avail, &dc_val, &dc_pred_dir);
+    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, mquant, n, v->a_avail, v->c_avail, &dc_val, &dc_pred_dir, mb_pos);
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-
-    //AC Decoding
-    i = 1;
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
     /* check if AC is needed at all */
     if (!a_avail && !c_avail)
         use_pred = 0;
+
     ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
     ac_val2 = ac_val;
-
-    scale = mquant * 2 + ((mquant == v->pq) ? v->halfpq : 0);
-
     if (dc_pred_dir) // left
         ac_val -= 16;
     else // top
         ac_val -= 16 * s->block_wrap[n];
 
+    scale = mquant * 2 + ((mquant == v->pq) ? v->halfpq : 0);
+
     q1 = s->current_picture.qscale_table[mb_pos];
-    if ( dc_pred_dir && c_avail && mb_pos)
-        q2 = s->current_picture.qscale_table[mb_pos - 1];
-    if (!dc_pred_dir && a_avail && mb_pos >= s->mb_stride)
-        q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
-    if ( dc_pred_dir && n == 1)
-        q2 = q1;
-    if (!dc_pred_dir && n == 2)
-        q2 = q1;
     if (n == 3)
         q2 = q1;
+    else if (dc_pred_dir) {
+        if (n == 1)
+            q2 = q1;
+        else if (c_avail && mb_pos)
+            q2 = s->current_picture.qscale_table[mb_pos - 1];
+    } else {
+        if (n == 2)
+            q2 = q1;
+        else if (a_avail && mb_pos >= s->mb_stride)
+            q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
+    }
+
+    //AC Decoding
+    i = 1;
 
     if (coded) {
         int last = 0, skip, value;
@@ -851,28 +793,24 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
+            if (dc_pred_dir) { // left
+                sh = v->left_blk_sh;
+            } else { // top
+                sh = v->top_blk_sh;
+                ac_val += 8;
+            }
             /* scale predictors if needed*/
             if (q2 && q1 != q2) {
                 q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-
                 if (q1 < 1)
                     return AVERROR_INVALIDDATA;
-                if (dc_pred_dir) { // left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                } else { // top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += (ac_val[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
             } else {
-                if (dc_pred_dir) { //left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += ac_val[k];
-                } else { //top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += ac_val[k + 8];
-                }
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += ac_val[k];
             }
         }
         /* save AC coeffs for further prediction */
@@ -889,55 +827,38 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -mquant : mquant;
             }
 
-        if (use_pred) i = 63;
     } else { // no AC coeffs
         int k;
 
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            if (use_pred) {
-                memcpy(ac_val2, ac_val, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        } else { // top
-            if (use_pred) {
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k + 8] = (ac_val2[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        }
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val2[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val2[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            if (q2 && q1 != q2) {
+                q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                if (q1 < 1)
+                    return AVERROR_INVALIDDATA;
+                for (k = 1; k < 8; k++)
+                    ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
+            }
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val2[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -mquant : mquant;
             }
-            i = 63;
         }
     }
+    if (use_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -952,7 +873,7 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
  * @param codingset set of VLC to decode data
  */
 static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
-                                  int coded, int mquant, int codingset)
+                                  int coded, int mquant, int codingset, int mb_pos)
 {
     GetBitContext *gb = &v->s.gb;
     MpegEncContext *s = &v->s;
@@ -961,7 +882,6 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int a_avail = v->a_avail, c_avail = v->c_avail;
     int use_pred = s->ac_pred;
     int scale;
@@ -970,125 +890,122 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     s->bdsp.clear_block(block);
 
     /* XXX: Guard against dumb values of mquant */
-    mquant = (mquant < 1) ? 0 : ((mquant > 31) ? 31 : mquant);
+    mquant = av_clip(mquant, 0, 31);
 
     /* Set DC scale - y and c use the same */
     s->y_dc_scale = s->y_dc_scale_table[mquant];
     s->c_dc_scale = s->c_dc_scale_table[mquant];
 
     /* Get DC differential */
-    if (n < 4) {
+    if (n < 4)
         dcdiff = get_vlc2(&s->gb, ff_msmp4_dc_luma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
-    } else {
+    else
         dcdiff = get_vlc2(&s->gb, ff_msmp4_dc_chroma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
-    }
     if (dcdiff < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Illegal DC VLC\n");
         return -1;
     }
     if (dcdiff) {
+        const int m = (mquant == 1 || mquant == 2) ? 3 - mquant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
     }
 
     /* Prediction */
-    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, mquant, n, a_avail, c_avail, &dc_val, &dc_pred_dir);
+    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, mquant, n, a_avail, c_avail, &dc_val, &dc_pred_dir, mb_pos);
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-
-    //AC Decoding
-    i = 1;
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
     /* check if AC is needed at all and adjust direction if needed */
     if (!a_avail) dc_pred_dir = 1;
     if (!c_avail) dc_pred_dir = 0;
     if (!a_avail && !c_avail) use_pred = 0;
+
     ac_val = s->ac_val[0][0] + s->block_index[n] * 16;
     ac_val2 = ac_val;
-
-    scale = mquant * 2 + v->halfpq;
-
     if (dc_pred_dir) //left
         ac_val -= 16;
     else //top
         ac_val -= 16 * s->block_wrap[n];
 
+    scale = mquant * 2 + v->halfpq;
+
     q1 = s->current_picture.qscale_table[mb_pos];
-    if (dc_pred_dir && c_avail && mb_pos)
-        q2 = s->current_picture.qscale_table[mb_pos - 1];
-    if (!dc_pred_dir && a_avail && mb_pos >= s->mb_stride)
-        q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
-    if ( dc_pred_dir && n == 1)
-        q2 = q1;
-    if (!dc_pred_dir && n == 2)
+    if (n == 3)
         q2 = q1;
-    if (n == 3) q2 = q1;
+    else if (dc_pred_dir) {
+        if (n == 1)
+            q2 = q1;
+        else if (c_avail && mb_pos)
+            q2 = s->current_picture.qscale_table[mb_pos - 1];
+    } else {
+        if (n == 2)
+            q2 = q1;
+        else if (a_avail && mb_pos >= s->mb_stride)
+            q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
+    }
+
+    //AC Decoding
+    i = 1;
 
     if (coded) {
         int last = 0, skip, value;
+        const uint8_t *zz_table;
         int k;
 
+        if (v->fcm == PROGRESSIVE)
+            zz_table = v->zz_8x8[0];
+        else {
+            if (use_pred && (v->fcm == ILACE_FRAME)) {
+                if (!dc_pred_dir) // top
+                    zz_table = v->zz_8x8[2];
+                else // left
+                    zz_table = v->zz_8x8[3];
+            } else {
+                zz_table = v->zzi_8x8;
+            }
+        }
+
         while (!last) {
             vc1_decode_ac_coeff(v, &last, &skip, &value, codingset);
             i += skip;
             if (i > 63)
                 break;
-            if (v->fcm == PROGRESSIVE)
-                block[v->zz_8x8[0][i++]] = value;
-            else {
-                if (use_pred && (v->fcm == ILACE_FRAME)) {
-                    if (!dc_pred_dir) // top
-                        block[v->zz_8x8[2][i++]] = value;
-                    else // left
-                        block[v->zz_8x8[3][i++]] = value;
-                } else {
-                    block[v->zzi_8x8[i++]] = value;
-                }
-            }
+            block[zz_table[i++]] = value;
         }
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
+            if (dc_pred_dir) { // left
+                sh = v->left_blk_sh;
+            } else { //top
+                sh = v->top_blk_sh;
+                ac_val += 8;
+            }
             /* scale predictors if needed*/
             if (q2 && q1 != q2) {
                 q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-
                 if (q1 < 1)
                     return AVERROR_INVALIDDATA;
-                if (dc_pred_dir) { // left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                } else { //top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += (ac_val[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
             } else {
-                if (dc_pred_dir) { // left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += ac_val[k];
-                } else { // top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += ac_val[k + 8];
-                }
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += ac_val[k];
             }
         }
         /* save AC coeffs for further prediction */
@@ -1105,55 +1022,38 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -mquant : mquant;
             }
 
-        if (use_pred) i = 63;
     } else { // no AC coeffs
         int k;
 
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            if (use_pred) {
-                memcpy(ac_val2, ac_val, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        } else { // top
-            if (use_pred) {
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k + 8] = (ac_val2[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        }
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val2[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val2[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            if (q2 && q1 != q2) {
+                q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
+                if (q1 < 1)
+                    return AVERROR_INVALIDDATA;
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                for (k = 1; k < 8; k++)
+                    ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
+            }
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val2[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -mquant : mquant;
             }
-            i = 63;
         }
     }
+    if (use_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -1173,6 +1073,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
     int scale, off, idx, last, skip, value;
     int ttblk = ttmb & 7;
     int pat = 0;
+    const uint8_t *zz_table;
 
     s->bdsp.clear_block(block);
 
@@ -1190,34 +1091,33 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
             subblkpat ^= 3; // swap decoded pattern bits
         if (ttblk == TT_8X4_TOP || ttblk == TT_8X4_BOTTOM)
             ttblk = TT_8X4;
-        if (ttblk == TT_4X8_RIGHT || ttblk == TT_4X8_LEFT)
+        else if (ttblk == TT_4X8_RIGHT || ttblk == TT_4X8_LEFT)
             ttblk = TT_4X8;
     }
-    scale = 2 * mquant + ((v->pq == mquant) ? v->halfpq : 0);
-
     // convert transforms like 8X4_TOP to generic TT and SUBBLKPAT
-    if (ttblk == TT_8X4_TOP || ttblk == TT_8X4_BOTTOM) {
+    else if (ttblk == TT_8X4_TOP || ttblk == TT_8X4_BOTTOM) {
         subblkpat = 2 - (ttblk == TT_8X4_TOP);
         ttblk     = TT_8X4;
     }
-    if (ttblk == TT_4X8_RIGHT || ttblk == TT_4X8_LEFT) {
+    else if (ttblk == TT_4X8_RIGHT || ttblk == TT_4X8_LEFT) {
         subblkpat = 2 - (ttblk == TT_4X8_LEFT);
         ttblk     = TT_4X8;
     }
+
+    scale = 2 * mquant + ((v->pq == mquant) ? v->halfpq : 0);
+
     switch (ttblk) {
     case TT_8X8:
         pat  = 0xF;
         i    = 0;
         last = 0;
+        zz_table = !v->fcm ? v->zz_8x8[0] : v->zzi_8x8;
         while (!last) {
             vc1_decode_ac_coeff(v, &last, &skip, &value, v->codingset2);
             i += skip;
             if (i > 63)
                 break;
-            if (!v->fcm)
-                idx = v->zz_8x8[0][i++];
-            else
-                idx = v->zzi_8x8[i++];
+            idx = zz_table[i++];
             block[idx] = value * scale;
             if (!v->pquantizer)
                 block[idx] += (block[idx] < 0) ? -mquant : mquant;
@@ -1234,34 +1134,33 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
     case TT_4X4:
         pat = ~subblkpat & 0xF;
         for (j = 0; j < 4; j++) {
-            last = subblkpat & (1 << (3 - j));
+            last = subblkpat & (8 >> j);
             i    = 0;
             off  = (j & 1) * 4 + (j & 2) * 16;
+            zz_table = !v->fcm ? ff_vc1_simple_progressive_4x4_zz : ff_vc1_adv_interlaced_4x4_zz;
             while (!last) {
                 vc1_decode_ac_coeff(v, &last, &skip, &value, v->codingset2);
                 i += skip;
                 if (i > 15)
                     break;
-                if (!v->fcm)
-                    idx = ff_vc1_simple_progressive_4x4_zz[i++];
-                else
-                    idx = ff_vc1_adv_interlaced_4x4_zz[i++];
+                idx = zz_table[i++];
                 block[idx + off] = value * scale;
                 if (!v->pquantizer)
                     block[idx + off] += (block[idx + off] < 0) ? -mquant : mquant;
             }
-            if (!(subblkpat & (1 << (3 - j))) && !skip_block) {
+            if (!(subblkpat & (8 >> j)) && !skip_block) {
                 if (i == 1)
-                    v->vc1dsp.vc1_inv_trans_4x4_dc(dst + (j & 1) * 4 + (j & 2) * 2 * linesize, linesize, block + off);
+                    v->vc1dsp.vc1_inv_trans_4x4_dc(dst + ((j & 1) + (j & 2 ? linesize : 0)) * 4, linesize, block + off);
                 else
-                    v->vc1dsp.vc1_inv_trans_4x4(dst + (j & 1) * 4 + (j & 2) *  2 * linesize, linesize, block + off);
+                    v->vc1dsp.vc1_inv_trans_4x4(dst + ((j & 1) + (j & 2 ? linesize : 0)) * 4, linesize, block + off);
             }
         }
         break;
     case TT_8X4:
         pat = ~((subblkpat & 2) * 6 + (subblkpat & 1) * 3) & 0xF;
+        zz_table = !v->fcm ? v->zz_8x4 : ff_vc1_adv_interlaced_8x4_zz;
         for (j = 0; j < 2; j++) {
-            last = subblkpat & (1 << (1 - j));
+            last = subblkpat & (2 >> j);
             i    = 0;
             off  = j * 32;
             while (!last) {
@@ -1269,15 +1168,12 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                 i += skip;
                 if (i > 31)
                     break;
-                if (!v->fcm)
-                    idx = v->zz_8x4[i++] + off;
-                else
-                    idx = ff_vc1_adv_interlaced_8x4_zz[i++] + off;
+                idx = zz_table[i++] + off;
                 block[idx] = value * scale;
                 if (!v->pquantizer)
                     block[idx] += (block[idx] < 0) ? -mquant : mquant;
             }
-            if (!(subblkpat & (1 << (1 - j))) && !skip_block) {
+            if (!(subblkpat & (2 >> j)) && !skip_block) {
                 if (i == 1)
                     v->vc1dsp.vc1_inv_trans_8x4_dc(dst + j * 4 * linesize, linesize, block + off);
                 else
@@ -1287,8 +1183,9 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
         break;
     case TT_4X8:
         pat = ~(subblkpat * 5) & 0xF;
+        zz_table = !v->fcm ? v->zz_4x8 : ff_vc1_adv_interlaced_4x8_zz;
         for (j = 0; j < 2; j++) {
-            last = subblkpat & (1 << (1 - j));
+            last = subblkpat & (2 >> j);
             i    = 0;
             off  = j * 4;
             while (!last) {
@@ -1296,15 +1193,12 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                 i += skip;
                 if (i > 31)
                     break;
-                if (!v->fcm)
-                    idx = v->zz_4x8[i++] + off;
-                else
-                    idx = ff_vc1_adv_interlaced_4x8_zz[i++] + off;
+                idx = zz_table[i++] + off;
                 block[idx] = value * scale;
                 if (!v->pquantizer)
                     block[idx] += (block[idx] < 0) ? -mquant : mquant;
             }
-            if (!(subblkpat & (1 << (1 - j))) && !skip_block) {
+            if (!(subblkpat & (2 >> j)) && !skip_block) {
                 if (i == 1)
                     v->vc1dsp.vc1_inv_trans_4x8_dc(dst + j * 4, linesize, block + off);
                 else
@@ -1320,17 +1214,15 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
 
 /** @} */ // Macroblock group
 
-static const int size_table  [6] = { 0, 2, 3, 4,  5,  8 };
-static const int offset_table[6] = { 0, 1, 3, 7, 15, 31 };
+static const uint8_t size_table[6] = { 0, 2, 3, 4,  5,  8 };
 
 /** Decode one P-frame MB
  */
-static int vc1_decode_p_mb(VC1Context *v)
+static int vc1_decode_p_mb(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
     int i, j;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
     int ttmb = v->ttfrm; /* MB Transform type */
@@ -1340,7 +1232,7 @@ static int vc1_decode_p_mb(VC1Context *v)
     int index, index1; /* LUT indexes */
     int val, sign; /* temp values */
     int first_block = 1;
-    int dst_idx, off;
+    int off;
     int skipped, fourmv;
     int block_cbp = 0, pat, block_tt = 0, block_intra = 0;
 
@@ -1355,8 +1247,8 @@ static int vc1_decode_p_mb(VC1Context *v)
     else
         skipped = v->s.mbskip_table[mb_pos];
 
-    if (!fourmv) { /* 1MV mode */
-        if (!skipped) {
+    if (!skipped) {
+        if (!fourmv) { /* 1MV mode */
             GET_MVDATA(dmv_x, dmv_y);
 
             if (s->mb_intra) {
@@ -1367,83 +1259,97 @@ static int vc1_decode_p_mb(VC1Context *v)
             ff_vc1_pred_mv(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], 0, 0);
 
             /* FIXME Set DC val for inter block ? */
-            if (s->mb_intra && !mb_has_coeffs) {
-                GET_MQUANT();
-                s->ac_pred = get_bits1(gb);
-                cbp        = 0;
-            } else if (mb_has_coeffs) {
+            if (mb_has_coeffs) {
                 if (s->mb_intra)
                     s->ac_pred = get_bits1(gb);
                 cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
                 GET_MQUANT();
             } else {
-                mquant = v->pq;
-                cbp    = 0;
+                if (s->mb_intra) {
+                    GET_MQUANT();
+                    s->ac_pred = get_bits1(gb);
+                }
+                cbp = 0;
             }
             s->current_picture.qscale_table[mb_pos] = mquant;
 
             if (!v->ttmbf && !s->mb_intra && mb_has_coeffs)
                 ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table,
                                 VC1_TTMB_VLC_BITS, 2);
-            if (!s->mb_intra) ff_vc1_mc_1mv(v, 0);
-            dst_idx = 0;
+            if (!s->mb_intra)
+                ff_vc1_mc_1mv(v, 0);
             for (i = 0; i < 6; i++) {
                 s->dc_val[0][s->block_index[i]] = 0;
-                dst_idx += i >> 2;
                 val = ((cbp >> (5 - i)) & 1);
-                off = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
                 v->mb_type[0][s->block_index[i]] = s->mb_intra;
                 if (s->mb_intra) {
                     /* check if prediction blocks A and C are available */
                     v->a_avail = v->c_avail = 0;
-                    if (i == 2 || i == 3 || !s->first_slice_line)
-                        v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-                    if (i == 1 || i == 3 || s->mb_x)
-                        v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+                    if (i < 4) {
+                        if (!s->first_slice_line || (i & 2))
+                            v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                        if (s->mb_x || (i & 1))
+                            v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-                    vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                           (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i>3) && (s->flags & CODEC_FLAG_GRAY))
-                        continue;
+                        vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                               v->codingset, mb_pos);
+                    } else {
+                        if (!s->first_slice_line)
+                            v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                        if (s->mb_x)
+                            v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                        vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                               v->codingset2, mb_pos);
+                        if (s->flags & CODEC_FLAG_GRAY)
+                            continue;
+                    }
                     v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[i][j] <<= 1;
-                    s->idsp.put_signed_pixels_clamped(s->block[i],
-                                                      s->dest[dst_idx] + off,
-                                                      i & 4 ? s->uvlinesize
-                                                            : s->linesize);
-                    if (v->pq >= 9 && v->overlap) {
-                        if (v->c_avail)
-                            v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
-                        if (v->a_avail)
-                            v->vc1dsp.vc1_v_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
+                    if (i < 4) {
+                        off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                        s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                          s->dest[0] + off,
+                                                          s->linesize);
+                        if (v->pq >= 9 && v->overlap) {
+                            if (v->c_avail)
+                                v->vc1dsp.vc1_h_overlap(s->dest[0] + off, s->linesize);
+                            if (v->a_avail)
+                                v->vc1dsp.vc1_v_overlap(s->dest[0] + off, s->linesize);
+                        }
+                    } else {
+                        s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                          s->dest[i - 3],
+                                                          s->uvlinesize);
+                        if (v->pq >= 9 && v->overlap) {
+                            if (v->c_avail)
+                                v->vc1dsp.vc1_h_overlap(s->dest[i - 3], s->uvlinesize);
+                            if (v->a_avail)
+                                v->vc1dsp.vc1_v_overlap(s->dest[i - 3], s->uvlinesize);
+                        }
                     }
                     block_cbp   |= 0xF << (i << 2);
                     block_intra |= 1 << i;
                 } else if (val) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block,
-                                             s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->flags & CODEC_FLAG_GRAY), &block_tt);
+                    if (i < 4) {
+                        off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block,
+                                                 s->dest[0] + off, s->linesize,
+                                                 0, &block_tt);
+                    } else {
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block,
+                                                 s->dest[i - 3], s->uvlinesize,
+                                                 (s->flags & CODEC_FLAG_GRAY), &block_tt);
+                    }
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
                     first_block = 0;
                 }
             }
-        } else { // skipped
-            s->mb_intra = 0;
-            for (i = 0; i < 6; i++) {
-                v->mb_type[0][s->block_index[i]] = 0;
-                s->dc_val[0][s->block_index[i]]  = 0;
-            }
-            s->current_picture.mb_type[mb_pos]      = MB_TYPE_SKIP;
-            s->current_picture.qscale_table[mb_pos] = 0;
-            ff_vc1_pred_mv(v, 0, 0, 0, 1, v->range_x, v->range_y, v->mb_type[0], 0, 0);
-            ff_vc1_mc_1mv(v, 0);
-        }
-    } else { // 4MV mode
-        if (!skipped /* unskipped MB */) {
+        } else { // 4MV mode
             int intra_count = 0, coded_inter = 0;
             int is_intra[6], is_coded[6];
             /* Get CBPCY */
@@ -1454,7 +1360,6 @@ static int vc1_decode_p_mb(VC1Context *v)
                 s->mb_intra                     = 0;
                 if (i < 4) {
                     dmv_x = dmv_y = 0;
-                    s->mb_intra   = 0;
                     mb_has_coeffs = 0;
                     if (val) {
                         GET_MVDATA(dmv_x, dmv_y);
@@ -1465,98 +1370,126 @@ static int vc1_decode_p_mb(VC1Context *v)
                     intra_count += s->mb_intra;
                     is_intra[i]  = s->mb_intra;
                     is_coded[i]  = mb_has_coeffs;
-                }
-                if (i & 4) {
+                } else {
                     is_intra[i] = (intra_count >= 3);
                     is_coded[i] = val;
+                    if (i == 4)
+                        ff_vc1_mc_4mv_chroma(v, 0);
                 }
-                if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, 0);
                 v->mb_type[0][s->block_index[i]] = is_intra[i];
-                if (!coded_inter)
-                    coded_inter = !is_intra[i] & is_coded[i];
+                coded_inter += !is_intra[i] & is_coded[i];
             }
             // if there are no coded blocks then don't do anything more
-            dst_idx = 0;
             if (!intra_count && !coded_inter)
                 goto end;
             GET_MQUANT();
             s->current_picture.qscale_table[mb_pos] = mquant;
             /* test if block is intra and has pred */
-            {
-                int intrapred = 0;
-                for (i = 0; i < 6; i++)
-                    if (is_intra[i]) {
-                        if (((!s->first_slice_line || (i == 2 || i == 3)) && v->mb_type[0][s->block_index[i] - s->block_wrap[i]])
-                            || ((s->mb_x || (i == 1 || i == 3)) && v->mb_type[0][s->block_index[i] - 1])) {
-                            intrapred = 1;
-                            break;
-                        }
-                    }
-                if (intrapred)
-                    s->ac_pred = get_bits1(gb);
-                else
-                    s->ac_pred = 0;
-            }
+            for (i = 0; i < 6; i++)
+                if (is_intra[i]) {
+                    if (((!s->first_slice_line || (i == 2 || i == 3)) &&
+                         v->mb_type[0][s->block_index[i] - s->block_wrap[i]]) ||
+                        ((s->mb_x || (i == 1 || i == 3)) &&
+                         v->mb_type[0][s->block_index[i] - 1]))
+                        break;
+                }
+            if (i < 6)
+                s->ac_pred = get_bits1(gb);
+            else
+                s->ac_pred = 0;
             if (!v->ttmbf && coded_inter)
                 ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
             for (i = 0; i < 6; i++) {
-                dst_idx    += i >> 2;
-                off         = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
                 s->mb_intra = is_intra[i];
                 if (is_intra[i]) {
                     /* check if prediction blocks A and C are available */
                     v->a_avail = v->c_avail = 0;
-                    if (i == 2 || i == 3 || !s->first_slice_line)
-                        v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-                    if (i == 1 || i == 3 || s->mb_x)
-                        v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+                    if (i < 4) {
+                        if (!s->first_slice_line || (i & 2))
+                            v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                        if (s->mb_x || (i & 1))
+                            v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-                    vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant,
-                                           (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i>3) && (s->flags & CODEC_FLAG_GRAY))
-                        continue;
+                        vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant,
+                                               v->codingset, mb_pos);
+                    } else {
+                        if (!s->first_slice_line)
+                            v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                        if (s->mb_x)
+                            v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                        vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant,
+                                               v->codingset2, mb_pos);
+                        if (s->flags & CODEC_FLAG_GRAY)
+                            continue;
+                    }
                     v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[i][j] <<= 1;
-                    s->idsp.put_signed_pixels_clamped(s->block[i],
-                                                      s->dest[dst_idx] + off,
-                                                      (i & 4) ? s->uvlinesize
-                                                              : s->linesize);
-                    if (v->pq >= 9 && v->overlap) {
-                        if (v->c_avail)
-                            v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
-                        if (v->a_avail)
-                            v->vc1dsp.vc1_v_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
+                    if (i < 4) {
+                        off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                        s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                          s->dest[0] + off,
+                                                          s->linesize);
+                        if (v->pq >= 9 && v->overlap) {
+                            if (v->c_avail)
+                                v->vc1dsp.vc1_h_overlap(s->dest[0] + off, s->linesize);
+                            if (v->a_avail)
+                                v->vc1dsp.vc1_v_overlap(s->dest[0] + off, s->linesize);
+                        }
+                    } else {
+                        s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                          s->dest[i - 3],
+                                                          s->uvlinesize);
+                        if (v->pq >= 9 && v->overlap) {
+                            if (v->c_avail)
+                                v->vc1dsp.vc1_h_overlap(s->dest[i - 3], s->uvlinesize);
+                            if (v->a_avail)
+                                v->vc1dsp.vc1_v_overlap(s->dest[i - 3], s->uvlinesize);
+                        }
                     }
                     block_cbp   |= 0xF << (i << 2);
                     block_intra |= 1 << i;
                 } else if (is_coded[i]) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                                             first_block, s->dest[dst_idx] + off,
-                                             (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->flags & CODEC_FLAG_GRAY),
-                                             &block_tt);
+                    if (i < 4) {
+                        off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                                 first_block, s->dest[0] + off,
+                                                 s->linesize,
+                                                 0,
+                                                 &block_tt);
+                    } else {
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                                 first_block, s->dest[i - 3],
+                                                 s->uvlinesize,
+                                                 (s->flags & CODEC_FLAG_GRAY),
+                                                 &block_tt);
+                    }
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
                     first_block = 0;
                 }
             }
-        } else { // skipped MB
-            s->mb_intra                               = 0;
-            s->current_picture.qscale_table[mb_pos] = 0;
-            for (i = 0; i < 6; i++) {
-                v->mb_type[0][s->block_index[i]] = 0;
-                s->dc_val[0][s->block_index[i]]  = 0;
-            }
+        }
+    } else { // skipped
+        s->mb_intra = 0;
+        for (i = 0; i < 6; i++) {
+            v->mb_type[0][s->block_index[i]] = 0;
+            s->dc_val[0][s->block_index[i]]  = 0;
+        }
+        s->current_picture.qscale_table[mb_pos] = 0;
+        if (!fourmv) { /* 1MV mode */
+            s->current_picture.mb_type[mb_pos]      = MB_TYPE_SKIP;
+            ff_vc1_pred_mv(v, 0, 0, 0, 1, v->range_x, v->range_y, v->mb_type[0], 0, 0);
+            ff_vc1_mc_1mv(v, 0);
+        } else { // 4MV mode
             for (i = 0; i < 4; i++) {
                 ff_vc1_pred_mv(v, i, 0, 0, 0, v->range_x, v->range_y, v->mb_type[0], 0, 0);
                 ff_vc1_mc_4mv_luma(v, i, 0, 0);
             }
             ff_vc1_mc_4mv_chroma(v, 0);
-            s->current_picture.qscale_table[mb_pos] = 0;
         }
     }
 end:
@@ -1569,12 +1502,11 @@ end:
 
 /* Decode one macroblock in an interlaced frame p picture */
 
-static int vc1_decode_p_mb_intfr(VC1Context *v)
+static int vc1_decode_p_mb_intfr(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
     int i;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp = 0; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
     int ttmb = v->ttfrm; /* MB Transform type */
@@ -1583,11 +1515,11 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
     int dmv_x, dmv_y; /* Differential MV components */
     int val; /* temp value */
     int first_block = 1;
-    int dst_idx, off;
+    int off;
     int skipped, fourmv = 0, twomv = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0, mvbp;
-    int stride_y, fieldtx;
+    int fieldtx;
 
     mquant = v->pq; /* Lossy initialization */
 
@@ -1648,32 +1580,42 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
             /* Set DC scale - y and c use the same (not sure if necessary here) */
             s->y_dc_scale = s->y_dc_scale_table[mquant];
             s->c_dc_scale = s->c_dc_scale_table[mquant];
-            dst_idx = 0;
             for (i = 0; i < 6; i++) {
                 v->a_avail = v->c_avail          = 0;
                 v->mb_type[0][s->block_index[i]] = 1;
                 s->dc_val[0][s->block_index[i]]  = 0;
-                dst_idx += i >> 2;
                 val = ((cbp >> (5 - i)) & 1);
-                if (i == 2 || i == 3 || !s->first_slice_line)
-                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-                if (i == 1 || i == 3 || s->mb_x)
-                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+                if (i < 4) {
+                    if (!s->first_slice_line || (i & 2))
+                        v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                    if (s->mb_x || (i & 1))
+                        v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                    vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                           v->codingset, mb_pos);
+                } else {
+                    if (!s->first_slice_line)
+                        v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                    if (s->mb_x)
+                        v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                       (i & 4) ? v->codingset2 : v->codingset);
-                if ((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
+                    vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                           v->codingset2, mb_pos);
+                    if (s->flags & CODEC_FLAG_GRAY)
+                        continue;
+                }
                 v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                 if (i < 4) {
-                    stride_y = s->linesize << fieldtx;
-                    off = (fieldtx) ? ((i & 1) * 8) + ((i & 2) >> 1) * s->linesize : (i & 1) * 8 + 4 * (i & 2) * s->linesize;
+                    off = fieldtx ? ((i & 1) * 8 + (i & 2 ? s->linesize : 0))
+                                  : ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                    s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                      s->dest[0] + off,
+                                                      s->linesize << fieldtx);
                 } else {
-                    stride_y = s->uvlinesize;
-                    off = 0;
+                    s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                      s->dest[i - 3],
+                                                      s->uvlinesize);
                 }
-                s->idsp.put_signed_pixels_clamped(s->block[i],
-                                                  s->dest[dst_idx] + off,
-                                                  stride_y);
                 //TODO: loop filter
             }
 
@@ -1683,46 +1625,36 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                 cbp = 1 + get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
             if (ff_vc1_mbmode_intfrp[v->fourmvswitch][idx_mbmode][0] == MV_PMODE_INTFR_2MV_FIELD) {
                 v->twomvbp = get_vlc2(gb, v->twomvbp_vlc->table, VC1_2MV_BLOCK_PATTERN_VLC_BITS, 1);
-            } else {
-                if ((ff_vc1_mbmode_intfrp[v->fourmvswitch][idx_mbmode][0] == MV_PMODE_INTFR_4MV)
-                    || (ff_vc1_mbmode_intfrp[v->fourmvswitch][idx_mbmode][0] == MV_PMODE_INTFR_4MV_FIELD)) {
-                    v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-                }
+            } else if (ff_vc1_mbmode_intfrp[v->fourmvswitch][idx_mbmode][0] == MV_PMODE_INTFR_4MV ||
+                       ff_vc1_mbmode_intfrp[v->fourmvswitch][idx_mbmode][0] == MV_PMODE_INTFR_4MV_FIELD) {
+                v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
             }
             s->mb_intra = v->is_intra[s->mb_x] = 0;
             for (i = 0; i < 6; i++)
                 v->mb_type[0][s->block_index[i]] = 0;
             fieldtx = v->fieldtx_plane[mb_pos] = ff_vc1_mbmode_intfrp[v->fourmvswitch][idx_mbmode][1];
             /* for all motion vector read MVDATA and motion compensate each block */
-            dst_idx = 0;
             if (fourmv) {
                 mvbp = v->fourmvbp;
-                for (i = 0; i < 6; i++) {
-                    if (i < 4) {
-                        dmv_x = dmv_y = 0;
-                        val   = ((mvbp >> (3 - i)) & 1);
-                        if (val) {
-                            get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                        }
-                        ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
-                        ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                    } else if (i == 4) {
-                        ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
-                    }
+                for (i = 0; i < 4; i++) {
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & (8 >> i))
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
+                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
                 }
+                ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
             } else if (twomv) {
                 mvbp  = v->twomvbp;
                 dmv_x = dmv_y = 0;
-                if (mvbp & 2) {
+                if (mvbp & 2)
                     get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                }
                 ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], 0);
                 ff_vc1_mc_4mv_luma(v, 0, 0, 0);
                 ff_vc1_mc_4mv_luma(v, 1, 0, 0);
                 dmv_x = dmv_y = 0;
-                if (mvbp & 1) {
+                if (mvbp & 1)
                     get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                }
                 ff_vc1_pred_mv_intfr(v, 2, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], 0);
                 ff_vc1_mc_4mv_luma(v, 2, 0, 0);
                 ff_vc1_mc_4mv_luma(v, 3, 0, 0);
@@ -1743,17 +1675,22 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                 ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
             for (i = 0; i < 6; i++) {
                 s->dc_val[0][s->block_index[i]] = 0;
-                dst_idx += i >> 2;
-                val = ((cbp >> (5 - i)) & 1);
-                if (!fieldtx)
-                    off = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-                else
-                    off = (i & 4) ? 0 : ((i & 1) * 8 + ((i > 1) * s->linesize));
-                if (val) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                                             first_block, s->dest[dst_idx] + off,
-                                             (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->flags & CODEC_FLAG_GRAY), &block_tt);
+                if (cbp & (32 >> i)) {
+                    if (i < 4) {
+                        if (!fieldtx)
+                            off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                        else
+                            off = ((i & 1) * 8 + (i & 2 ? s->linesize : 0));
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                                 first_block, s->dest[0] + off,
+                                                 (s->linesize << fieldtx),
+                                                 0, &block_tt);
+                    } else {
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                                 first_block, s->dest[i - 3],
+                                                 s->uvlinesize,
+                                                 (s->flags & CODEC_FLAG_GRAY), &block_tt);
+                    }
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1781,12 +1718,11 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
     return 0;
 }
 
-static int vc1_decode_p_mb_intfi(VC1Context *v)
+static int vc1_decode_p_mb_intfi(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
     int i;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp = 0; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
     int ttmb = v->ttfrm; /* MB Transform type */
@@ -1795,7 +1731,7 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
     int dmv_x, dmv_y; /* Differential MV components */
     int val; /* temp values */
     int first_block = 1;
-    int dst_idx, off;
+    int off;
     int pred_flag = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0;
@@ -1818,56 +1754,65 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
         mb_has_coeffs = idx_mbmode & 1;
         if (mb_has_coeffs)
             cbp = 1 + get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_ICBPCY_VLC_BITS, 2);
-        dst_idx = 0;
         for (i = 0; i < 6; i++) {
             v->a_avail = v->c_avail          = 0;
             v->mb_type[0][s->block_index[i]] = 1;
             s->dc_val[0][s->block_index[i]]  = 0;
-            dst_idx += i >> 2;
             val = ((cbp >> (5 - i)) & 1);
-            if (i == 2 || i == 3 || !s->first_slice_line)
-                v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-            if (i == 1 || i == 3 || s->mb_x)
-                v->c_avail = v->mb_type[0][s->block_index[i] - 1];
-
-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
-            if ((i>3) && (s->flags & CODEC_FLAG_GRAY))
-                continue;
+            if (i < 4) {
+                if (!s->first_slice_line || (i & 2))
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x || (i & 1))
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset, mb_pos);
+            } else {
+                if (!s->first_slice_line)
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x)
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset2, mb_pos);
+                if (s->flags & CODEC_FLAG_GRAY)
+                    continue;
+            }
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
-            off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-            s->idsp.put_signed_pixels_clamped(s->block[i],
-                                              s->dest[dst_idx] + off,
-                                              (i & 4) ? s->uvlinesize
-                                                      : s->linesize);
+            if (i < 4) {
+                off  = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[0] + off,
+                                                  s->linesize);
+            } else {
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[i - 3],
+                                                  s->uvlinesize);
+            }
             // TODO: loop filter
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (idx_mbmode <= 5) { // 1-MV
             dmv_x = dmv_y = pred_flag = 0;
-            if (idx_mbmode & 1) {
+            if (idx_mbmode & 1)
                 get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
-            }
             ff_vc1_pred_mv(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
             ff_vc1_mc_1mv(v, 0);
             mb_has_coeffs = !(idx_mbmode & 2);
         } else { // 4-MV
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x = dmv_y = pred_flag = 0;
-                    val   = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
-                    }
-                    ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
-                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, 0);
+            for (i = 0; i < 4; i++) {
+                dmv_x = dmv_y = pred_flag = 0;
+                if (v->fourmvbp & (8 >> i))
+                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
+                ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
+                ff_vc1_mc_4mv_luma(v, i, 0, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, 0);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -1879,20 +1824,26 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
         if (!v->ttmbf && cbp) {
             ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
         }
-        dst_idx = 0;
         for (i = 0; i < 6; i++) {
             s->dc_val[0][s->block_index[i]] = 0;
-            dst_idx += i >> 2;
-            val = ((cbp >> (5 - i)) & 1);
-            off = (i & 4) ? 0 : (i & 1) * 8 + (i & 2) * 4 * s->linesize;
-            if (val) {
-                pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                                         first_block, s->dest[dst_idx] + off,
-                                         (i & 4) ? s->uvlinesize : s->linesize,
-                                         (i & 4) && (s->flags & CODEC_FLAG_GRAY),
-                                         &block_tt);
+            if (cbp & (32 >> i)) {
+                if (i < 4) {
+                    off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                             first_block, s->dest[0] + off,
+                                             s->linesize,
+                                             0,
+                                             &block_tt);
+                } else {
+                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                             first_block, s->dest[i - 3],
+                                             s->uvlinesize,
+                                             (s->flags & CODEC_FLAG_GRAY),
+                                             &block_tt);
+                }
                 block_cbp |= pat << (i << 2);
-                if (!v->ttmbf && ttmb < 8) ttmb = -1;
+                if (!v->ttmbf && ttmb < 8)
+                    ttmb = -1;
                 first_block = 0;
             }
         }
@@ -1904,12 +1855,11 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
 
 /** Decode one B-frame MB (in Main profile)
  */
-static void vc1_decode_b_mb(VC1Context *v)
+static void vc1_decode_b_mb(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
     int i, j;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp = 0; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
     int ttmb = v->ttfrm; /* MB Transform type */
@@ -1917,7 +1867,7 @@ static void vc1_decode_b_mb(VC1Context *v)
     int index, index1; /* LUT indexes */
     int val, sign; /* temp values */
     int first_block = 1;
-    int dst_idx, off;
+    int off;
     int skipped, direct;
     int dmv_x[2], dmv_y[2];
     int bmvtype = BMV_TYPE_BACKWARD;
@@ -1982,74 +1932,93 @@ static void vc1_decode_b_mb(VC1Context *v)
         dmv_x[0] = dmv_y[0] = dmv_x[1] = dmv_y[1] = 0;
         ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
         vc1_b_mc(v, dmv_x, dmv_y, direct, bmvtype);
-    } else {
-        if (!mb_has_coeffs && !s->mb_intra) {
+    } else if (!mb_has_coeffs) {
+        if (!s->mb_intra) {
             /* no coded blocks - effectively skipped */
             ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
             vc1_b_mc(v, dmv_x, dmv_y, direct, bmvtype);
             return;
-        }
-        if (s->mb_intra && !mb_has_coeffs) {
+        } else {
             GET_MQUANT();
             s->current_picture.qscale_table[mb_pos] = mquant;
             s->ac_pred = get_bits1(gb);
             cbp = 0;
             ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
-        } else {
-            if (bmvtype == BMV_TYPE_INTERPOLATED) {
-                GET_MVDATA(dmv_x[0], dmv_y[0]);
-                if (!mb_has_coeffs) {
-                    /* interpolated skipped block */
-                    ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
-                    vc1_b_mc(v, dmv_x, dmv_y, direct, bmvtype);
-                    return;
-                }
-            }
-            ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
-            if (!s->mb_intra) {
+        }
+    } else {
+        if (bmvtype == BMV_TYPE_INTERPOLATED) {
+            GET_MVDATA(dmv_x[0], dmv_y[0]);
+            if (!mb_has_coeffs) {
+                /* interpolated skipped block */
+                ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
                 vc1_b_mc(v, dmv_x, dmv_y, direct, bmvtype);
+                return;
             }
-            if (s->mb_intra)
-                s->ac_pred = get_bits1(gb);
-            cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
-            GET_MQUANT();
-            s->current_picture.qscale_table[mb_pos] = mquant;
-            if (!v->ttmbf && !s->mb_intra && mb_has_coeffs)
-                ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
         }
+        ff_vc1_pred_b_mv(v, dmv_x, dmv_y, direct, bmvtype);
+        if (!s->mb_intra)
+            vc1_b_mc(v, dmv_x, dmv_y, direct, bmvtype);
+        else
+            s->ac_pred = get_bits1(gb);
+        cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
+        GET_MQUANT();
+        s->current_picture.qscale_table[mb_pos] = mquant;
+        if (!v->ttmbf && !s->mb_intra && mb_has_coeffs)
+            ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
     }
-    dst_idx = 0;
     for (i = 0; i < 6; i++) {
         s->dc_val[0][s->block_index[i]] = 0;
-        dst_idx += i >> 2;
         val = ((cbp >> (5 - i)) & 1);
-        off = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
         v->mb_type[0][s->block_index[i]] = s->mb_intra;
         if (s->mb_intra) {
             /* check if prediction blocks A and C are available */
             v->a_avail = v->c_avail = 0;
-            if (i == 2 || i == 3 || !s->first_slice_line)
-                v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-            if (i == 1 || i == 3 || s->mb_x)
-                v->c_avail = v->mb_type[0][s->block_index[i] - 1];
-
-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
-            if ((i>3) && (s->flags & CODEC_FLAG_GRAY))
-                continue;
+            if (i < 4) {
+                if (!s->first_slice_line || (i & 2))
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x || (i & 1))
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset, mb_pos);
+            } else {
+                if (!s->first_slice_line)
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x)
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset2, mb_pos);
+                if (s->flags & CODEC_FLAG_GRAY)
+                    continue;
+            }
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
                 for (j = 0; j < 64; j++)
                     s->block[i][j] <<= 1;
-            s->idsp.put_signed_pixels_clamped(s->block[i],
-                                              s->dest[dst_idx] + off,
-                                              i & 4 ? s->uvlinesize
-                                                    : s->linesize);
+            if (i < 4) {
+                off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[0] + off,
+                                                  s->linesize);
+            } else {
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[i - 3],
+                                                  s->uvlinesize);
+            }
         } else if (val) {
-            vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                               first_block, s->dest[dst_idx] + off,
-                               (i & 4) ? s->uvlinesize : s->linesize,
-                               (i & 4) && (s->flags & CODEC_FLAG_GRAY), NULL);
+            if (i < 4) {
+                off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                   first_block, s->dest[0] + off,
+                                   s->linesize,
+                                   0, NULL);
+            } else {
+                vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                   first_block, s->dest[i - 3],
+                                   s->uvlinesize,
+                                   (s->flags & CODEC_FLAG_GRAY), NULL);
+            }
             if (!v->ttmbf && ttmb < 8)
                 ttmb = -1;
             first_block = 0;
@@ -2059,19 +2028,18 @@ static void vc1_decode_b_mb(VC1Context *v)
 
 /** Decode one B-frame MB (in interlaced field B picture)
  */
-static void vc1_decode_b_mb_intfi(VC1Context *v)
+static void vc1_decode_b_mb_intfi(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
     int i, j;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp = 0; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
     int ttmb = v->ttfrm; /* MB Transform type */
     int mb_has_coeffs = 0; /* last_flag */
     int val; /* temp value */
     int first_block = 1;
-    int dst_idx, off;
+    int off;
     int fwd;
     int dmv_x[2], dmv_y[2], pred_flag[2];
     int bmvtype = BMV_TYPE_BACKWARD;
@@ -2096,37 +2064,51 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
         mb_has_coeffs = idx_mbmode & 1;
         if (mb_has_coeffs)
             cbp = 1 + get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_ICBPCY_VLC_BITS, 2);
-        dst_idx = 0;
         for (i = 0; i < 6; i++) {
             v->a_avail = v->c_avail          = 0;
             v->mb_type[0][s->block_index[i]] = 1;
             s->dc_val[0][s->block_index[i]]  = 0;
-            dst_idx += i >> 2;
             val = ((cbp >> (5 - i)) & 1);
-            if (i == 2 || i == 3 || !s->first_slice_line)
-                v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-            if (i == 1 || i == 3 || s->mb_x)
-                v->c_avail = v->mb_type[0][s->block_index[i] - 1];
-
-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
-            if ((i>3) && (s->flags & CODEC_FLAG_GRAY))
-                continue;
+            if (i < 4) {
+                if (!s->first_slice_line || (i & 2))
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x || (i & 1))
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset, mb_pos);
+            } else {
+                if (!s->first_slice_line)
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x)
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset2, mb_pos);
+                if (s->flags & CODEC_FLAG_GRAY)
+                    continue;
+            }
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
                 for (j = 0; j < 64; j++)
                     s->block[i][j] <<= 1;
-            off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-            s->idsp.put_signed_pixels_clamped(s->block[i],
-                                              s->dest[dst_idx] + off,
-                                              (i & 4) ? s->uvlinesize
-                                                      : s->linesize);
+            if (i < 4) {
+                off  = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[0] + off,
+                                                  s->linesize);
+            } else {
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[i - 3],
+                                                  s->uvlinesize);
+            }
             // TODO: yet to perform loop filter
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (v->fmb_is_raw)
             fwd = v->forward_mb_plane[mb_pos] = get_bits1(gb);
         else
@@ -2174,21 +2156,18 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 bmvtype = BMV_TYPE_FORWARD;
             v->bmvtype  = bmvtype;
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
-                    dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
-                    val = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
-                                                 &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
-                                             &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
-                    }
-                    ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
-                    ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
+            for (i = 0; i < 4; i++) {
+                dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
+                dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
+                if (v->fourmvbp & (8 >> i)) {
+                    get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
+                                             &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
+                                         &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
+                }
+                ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
+                ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -2200,17 +2179,21 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
         if (!v->ttmbf && cbp) {
             ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
         }
-        dst_idx = 0;
         for (i = 0; i < 6; i++) {
             s->dc_val[0][s->block_index[i]] = 0;
-            dst_idx += i >> 2;
-            val = ((cbp >> (5 - i)) & 1);
-            off = (i & 4) ? 0 : (i & 1) * 8 + (i & 2) * 4 * s->linesize;
-            if (val) {
-                vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                                   first_block, s->dest[dst_idx] + off,
-                                   (i & 4) ? s->uvlinesize : s->linesize,
-                                   (i & 4) && (s->flags & CODEC_FLAG_GRAY), NULL);
+            if (cbp & (32 >> i)) {
+                if (i < 4) {
+                    off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                    vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                       first_block, s->dest[0] + off,
+                                       s->linesize,
+                                       0, NULL);
+                } else {
+                    vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                       first_block, s->dest[i - 3],
+                                       s->uvlinesize,
+                                       (s->flags & CODEC_FLAG_GRAY), NULL);
+                }
                 if (!v->ttmbf && ttmb < 8)
                     ttmb = -1;
                 first_block = 0;
@@ -2221,12 +2204,11 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
 
 /** Decode one B-frame MB (in interlaced frame B picture)
  */
-static int vc1_decode_b_mb_intfr(VC1Context *v)
+static int vc1_decode_b_mb_intfr(VC1Context *v, int mb_pos)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
     int i, j;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp = 0; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
     int ttmb = v->ttfrm; /* MB Transform type */
@@ -2235,11 +2217,11 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
     int dmv_x, dmv_y; /* Differential MV components */
     int val; /* temp value */
     int first_block = 1;
-    int dst_idx, off;
+    int off;
     int skipped, direct, twomv = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0, mvbp;
-    int stride_y, fieldtx;
+    int fieldtx;
     int bmvtype = BMV_TYPE_BACKWARD;
     int dir, dir2;
 
@@ -2321,33 +2303,42 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
         /* Set DC scale - y and c use the same (not sure if necessary here) */
         s->y_dc_scale = s->y_dc_scale_table[mquant];
         s->c_dc_scale = s->c_dc_scale_table[mquant];
-        dst_idx = 0;
         for (i = 0; i < 6; i++) {
             v->a_avail = v->c_avail          = 0;
             v->mb_type[0][s->block_index[i]] = 1;
             s->dc_val[0][s->block_index[i]]  = 0;
-            dst_idx += i >> 2;
             val = ((cbp >> (5 - i)) & 1);
-            if (i == 2 || i == 3 || !s->first_slice_line)
-                v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
-            if (i == 1 || i == 3 || s->mb_x)
-                v->c_avail = v->mb_type[0][s->block_index[i] - 1];
-
-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
-                                   (i & 4) ? v->codingset2 : v->codingset);
-            if (i > 3 && (s->flags & CODEC_FLAG_GRAY))
-                continue;
+            if (i < 4) {
+                if (!s->first_slice_line || (i & 2))
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x || (i & 1))
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset, mb_pos);
+            } else {
+                if (!s->first_slice_line)
+                    v->a_avail = v->mb_type[0][s->block_index[i] - s->block_wrap[i]];
+                if (s->mb_x)
+                    v->c_avail = v->mb_type[0][s->block_index[i] - 1];
+
+                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                                       v->codingset2, mb_pos);
+                if (s->flags & CODEC_FLAG_GRAY)
+                    continue;
+            }
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (i < 4) {
-                stride_y = s->linesize << fieldtx;
-                off = (fieldtx) ? ((i & 1) * 8) + ((i & 2) >> 1) * s->linesize : (i & 1) * 8 + 4 * (i & 2) * s->linesize;
+                off = fieldtx ? ((i & 1) * 8 + (i & 2 ? s->linesize : 0))
+                              : ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[0] + off,
+                                                  s->linesize << fieldtx);
             } else {
-                stride_y = s->uvlinesize;
-                off = 0;
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[i - 3],
+                                                  s->uvlinesize);
             }
-            s->idsp.put_signed_pixels_clamped(s->block[i],
-                                              s->dest[dst_idx] + off,
-                                              stride_y);
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
@@ -2386,102 +2377,100 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                 v->mb_type[0][s->block_index[i]] = 0;
             fieldtx = v->fieldtx_plane[mb_pos] = ff_vc1_mbmode_intfrp[0][idx_mbmode][1];
             /* for all motion vector read MVDATA and motion compensate each block */
-            dst_idx = 0;
-            if (direct) {
-                if (twomv) {
+            if (twomv) {
+                if (direct) {
                     for (i = 0; i < 4; i++) {
                         ff_vc1_mc_4mv_luma(v, i, 0, 0);
                         ff_vc1_mc_4mv_luma(v, i, 1, 1);
                     }
                     ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
                     ff_vc1_mc_4mv_chroma4(v, 1, 1, 1);
+                } else if (bmvtype == BMV_TYPE_INTERPOLATED) {
+                    mvbp = v->fourmvbp;
+                    for (i = 0; i < 4; i++) {
+                        dir = i & 1;
+                        dmv_x = dmv_y = 0;
+                        if (mvbp & (8 >> i))
+                            get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                        j = i & 2;
+                        ff_vc1_pred_mv_intfr(v, j, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], dir);
+                        ff_vc1_mc_4mv_luma(v, j, dir, dir);
+                        ff_vc1_mc_4mv_luma(v, j+1, dir, dir);
+                    }
+
+                    ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
+                    ff_vc1_mc_4mv_chroma4(v, 1, 1, 1);
                 } else {
-                    ff_vc1_mc_1mv(v, 0);
-                    ff_vc1_interp_mc(v);
-                }
-            } else if (twomv && bmvtype == BMV_TYPE_INTERPOLATED) {
-                mvbp = v->fourmvbp;
-                for (i = 0; i < 4; i++) {
-                    dir = i==1 || i==3;
+                    dir = bmvtype == BMV_TYPE_BACKWARD;
+                    dir2 = dir ^ mvsw;
+                    mvbp = v->twomvbp;
                     dmv_x = dmv_y = 0;
-                    val = ((mvbp >> (3 - i)) & 1);
-                    if (val)
+                    if (mvbp & 2)
                         get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                    j = i > 1 ? 2 : 0;
-                    ff_vc1_pred_mv_intfr(v, j, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], dir);
-                    ff_vc1_mc_4mv_luma(v, j, dir, dir);
-                    ff_vc1_mc_4mv_luma(v, j+1, dir, dir);
-                }
+                    ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], dir);
 
-                ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
-                ff_vc1_mc_4mv_chroma4(v, 1, 1, 1);
-            } else if (bmvtype == BMV_TYPE_INTERPOLATED) {
-                mvbp = v->twomvbp;
-                dmv_x = dmv_y = 0;
-                if (mvbp & 2)
-                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & 1)
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    ff_vc1_pred_mv_intfr(v, 2, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], dir2);
 
-                ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], 0);
-                ff_vc1_mc_1mv(v, 0);
+                    if (mvsw) {
+                        for (i = 0; i < 2; i++) {
+                            s->mv[dir][i+2][0] = s->mv[dir][i][0] = s->current_picture.motion_val[dir][s->block_index[i+2]][0] = s->current_picture.motion_val[dir][s->block_index[i]][0];
+                            s->mv[dir][i+2][1] = s->mv[dir][i][1] = s->current_picture.motion_val[dir][s->block_index[i+2]][1] = s->current_picture.motion_val[dir][s->block_index[i]][1];
+                            s->mv[dir2][i+2][0] = s->mv[dir2][i][0] = s->current_picture.motion_val[dir2][s->block_index[i]][0] = s->current_picture.motion_val[dir2][s->block_index[i+2]][0];
+                            s->mv[dir2][i+2][1] = s->mv[dir2][i][1] = s->current_picture.motion_val[dir2][s->block_index[i]][1] = s->current_picture.motion_val[dir2][s->block_index[i+2]][1];
+                        }
+                    } else {
+                        ff_vc1_pred_mv_intfr(v, 0, 0, 0, 2, v->range_x, v->range_y, v->mb_type[0], !dir);
+                        ff_vc1_pred_mv_intfr(v, 2, 0, 0, 2, v->range_x, v->range_y, v->mb_type[0], !dir);
+                    }
 
-                dmv_x = dmv_y = 0;
-                if (mvbp & 1)
-                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    ff_vc1_mc_4mv_luma(v, 0, dir, 0);
+                    ff_vc1_mc_4mv_luma(v, 1, dir, 0);
+                    ff_vc1_mc_4mv_luma(v, 2, dir2, 0);
+                    ff_vc1_mc_4mv_luma(v, 3, dir2, 0);
+                    ff_vc1_mc_4mv_chroma4(v, dir, dir2, 0);
+                }
+            } else {
+                if (direct) {
+                    ff_vc1_mc_1mv(v, 0);
+                    ff_vc1_interp_mc(v);
+                } else if (bmvtype == BMV_TYPE_INTERPOLATED) {
+                    mvbp = v->twomvbp;
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & 2)
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
 
-                ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], 1);
-                ff_vc1_interp_mc(v);
-            } else if (twomv) {
-                dir = bmvtype == BMV_TYPE_BACKWARD;
-                dir2 = dir;
-                if (mvsw)
-                    dir2 = !dir;
-                mvbp = v->twomvbp;
-                dmv_x = dmv_y = 0;
-                if (mvbp & 2)
-                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], dir);
+                    ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], 0);
+                    ff_vc1_mc_1mv(v, 0);
 
-                dmv_x = dmv_y = 0;
-                if (mvbp & 1)
-                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                ff_vc1_pred_mv_intfr(v, 2, dmv_x, dmv_y, 2, v->range_x, v->range_y, v->mb_type[0], dir2);
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & 1)
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
 
-                if (mvsw) {
-                    for (i = 0; i < 2; i++) {
-                        s->mv[dir][i+2][0] = s->mv[dir][i][0] = s->current_picture.motion_val[dir][s->block_index[i+2]][0] = s->current_picture.motion_val[dir][s->block_index[i]][0];
-                        s->mv[dir][i+2][1] = s->mv[dir][i][1] = s->current_picture.motion_val[dir][s->block_index[i+2]][1] = s->current_picture.motion_val[dir][s->block_index[i]][1];
-                        s->mv[dir2][i+2][0] = s->mv[dir2][i][0] = s->current_picture.motion_val[dir2][s->block_index[i]][0] = s->current_picture.motion_val[dir2][s->block_index[i+2]][0];
-                        s->mv[dir2][i+2][1] = s->mv[dir2][i][1] = s->current_picture.motion_val[dir2][s->block_index[i]][1] = s->current_picture.motion_val[dir2][s->block_index[i+2]][1];
-                    }
+                    ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], 1);
+                    ff_vc1_interp_mc(v);
                 } else {
-                    ff_vc1_pred_mv_intfr(v, 0, 0, 0, 2, v->range_x, v->range_y, v->mb_type[0], !dir);
-                    ff_vc1_pred_mv_intfr(v, 2, 0, 0, 2, v->range_x, v->range_y, v->mb_type[0], !dir);
-                }
-
-                ff_vc1_mc_4mv_luma(v, 0, dir, 0);
-                ff_vc1_mc_4mv_luma(v, 1, dir, 0);
-                ff_vc1_mc_4mv_luma(v, 2, dir2, 0);
-                ff_vc1_mc_4mv_luma(v, 3, dir2, 0);
-                ff_vc1_mc_4mv_chroma4(v, dir, dir2, 0);
-            } else {
-                dir = bmvtype == BMV_TYPE_BACKWARD;
+                    dir = bmvtype == BMV_TYPE_BACKWARD;
 
-                mvbp = ff_vc1_mbmode_intfrp[0][idx_mbmode][2];
-                dmv_x = dmv_y = 0;
-                if (mvbp)
-                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    mvbp = ff_vc1_mbmode_intfrp[0][idx_mbmode][2];
+                    dmv_x = dmv_y = 0;
+                    if (mvbp)
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
 
-                ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], dir);
-                v->blk_mv_type[s->block_index[0]] = 1;
-                v->blk_mv_type[s->block_index[1]] = 1;
-                v->blk_mv_type[s->block_index[2]] = 1;
-                v->blk_mv_type[s->block_index[3]] = 1;
-                ff_vc1_pred_mv_intfr(v, 0, 0, 0, 2, v->range_x, v->range_y, 0, !dir);
-                for (i = 0; i < 2; i++) {
-                    s->mv[!dir][i+2][0] = s->mv[!dir][i][0] = s->current_picture.motion_val[!dir][s->block_index[i+2]][0] = s->current_picture.motion_val[!dir][s->block_index[i]][0];
-                    s->mv[!dir][i+2][1] = s->mv[!dir][i][1] = s->current_picture.motion_val[!dir][s->block_index[i+2]][1] = s->current_picture.motion_val[!dir][s->block_index[i]][1];
+                    ff_vc1_pred_mv_intfr(v, 0, dmv_x, dmv_y, 1, v->range_x, v->range_y, v->mb_type[0], dir);
+                    v->blk_mv_type[s->block_index[0]] = 1;
+                    v->blk_mv_type[s->block_index[1]] = 1;
+                    v->blk_mv_type[s->block_index[2]] = 1;
+                    v->blk_mv_type[s->block_index[3]] = 1;
+                    ff_vc1_pred_mv_intfr(v, 0, 0, 0, 2, v->range_x, v->range_y, 0, !dir);
+                    for (i = 0; i < 2; i++) {
+                        s->mv[!dir][i+2][0] = s->mv[!dir][i][0] = s->current_picture.motion_val[!dir][s->block_index[i+2]][0] = s->current_picture.motion_val[!dir][s->block_index[i]][0];
+                        s->mv[!dir][i+2][1] = s->mv[!dir][i][1] = s->current_picture.motion_val[!dir][s->block_index[i+2]][1] = s->current_picture.motion_val[!dir][s->block_index[i]][1];
+                    }
+                    ff_vc1_mc_1mv(v, dir);
                 }
-                ff_vc1_mc_1mv(v, dir);
             }
 
             if (cbp)
@@ -2491,17 +2480,22 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                 ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
             for (i = 0; i < 6; i++) {
                 s->dc_val[0][s->block_index[i]] = 0;
-                dst_idx += i >> 2;
-                val = ((cbp >> (5 - i)) & 1);
-                if (!fieldtx)
-                    off = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-                else
-                    off = (i & 4) ? 0 : ((i & 1) * 8 + ((i > 1) * s->linesize));
-                if (val) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                                             first_block, s->dest[dst_idx] + off,
-                                             (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->flags & CODEC_FLAG_GRAY), &block_tt);
+                if (cbp & (32 >> i)) {
+                    if (i < 4) {
+                        if (!fieldtx)
+                            off = ((i & 1) + (i & 2 ? s->linesize : 0)) * 8;
+                        else
+                            off = ((i & 1) * 8 + (i & 2 ? s->linesize : 0));
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                             first_block, s->dest[0] + off,
+                                             (s->linesize << fieldtx),
+                                             0, &block_tt);
+                    } else {
+                        pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                             first_block, s->dest[i - 3],
+                                             s->uvlinesize,
+                                             (s->flags & CODEC_FLAG_GRAY), &block_tt);
+                    }
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -2530,9 +2524,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                     dir = bmvtype == BMV_TYPE_BACKWARD;
                     ff_vc1_pred_mv_intfr(v, 0, 0, 0, 1, v->range_x, v->range_y, v->mb_type[0], dir);
                     if (mvsw) {
-                        int dir2 = dir;
-                        if (mvsw)
-                            dir2 = !dir;
+                        int dir2 = !dir;
                         for (i = 0; i < 2; i++) {
                             s->mv[dir][i+2][0] = s->mv[dir][i][0] = s->current_picture.motion_val[dir][s->block_index[i+2]][0] = s->current_picture.motion_val[dir][s->block_index[i]][0];
                             s->mv[dir][i+2][1] = s->mv[dir][i][1] = s->current_picture.motion_val[dir][s->block_index[i+2]][1] = s->current_picture.motion_val[dir][s->block_index[i]][1];
@@ -2574,7 +2566,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
     MpegEncContext *s = &v->s;
     int cbp, val;
     uint8_t *coded_val;
-    int mb_pos;
+    int mb_index = 0;
 
     /* select codingmode used for VLC tables selection */
     switch (v->y_ac_table_index) {
@@ -2606,7 +2598,6 @@ static void vc1_decode_i_blocks(VC1Context *v)
     s->c_dc_scale = s->c_dc_scale_table[v->pq];
 
     //do frame decode
-    s->mb_x = s->mb_y = 0;
     s->mb_intra         = 1;
     s->first_slice_line = 1;
     for (s->mb_y = 0; s->mb_y < s->end_mb_y; s->mb_y++) {
@@ -2622,9 +2613,8 @@ static void vc1_decode_i_blocks(VC1Context *v)
             dst[4] = s->dest[1];
             dst[5] = s->dest[2];
             s->bdsp.clear_blocks(s->block[0]);
-            mb_pos = s->mb_x + s->mb_y * s->mb_width;
-            s->current_picture.mb_type[mb_pos]                     = MB_TYPE_INTRA;
-            s->current_picture.qscale_table[mb_pos]                = v->pq;
+            s->current_picture.mb_type[mb_index + s->mb_x]         = MB_TYPE_INTRA;
+            s->current_picture.qscale_table[mb_index + s->mb_x]    = v->pq;
             s->current_picture.motion_val[1][s->block_index[0]][0] = 0;
             s->current_picture.motion_val[1][s->block_index[0]][1] = 0;
 
@@ -2639,28 +2629,28 @@ static void vc1_decode_i_blocks(VC1Context *v)
                     int pred   = vc1_coded_block_pred(&v->s, k, &coded_val);
                     val        = val ^ pred;
                     *coded_val = val;
-                }
-                cbp |= val << (5 - k);
-
-                vc1_decode_i_block(v, s->block[k], k, val, (k < 4) ? v->codingset : v->codingset2);
 
-                if (k > 3 && (s->flags & CODEC_FLAG_GRAY))
-                    continue;
+                    vc1_decode_i_block(v, s->block[k], k, val, v->codingset);
+                } else {
+                    vc1_decode_i_block(v, s->block[k], k, val, v->codingset2);
+                    if (s->flags & CODEC_FLAG_GRAY)
+                        continue;
+                }
                 v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
                 if (v->pq >= 9 && v->overlap) {
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[k][j] <<= 1;
                     s->idsp.put_signed_pixels_clamped(s->block[k], dst[k],
-                                                      k & 4 ? s->uvlinesize
-                                                            : s->linesize);
+                                                      k < 4 ? s->linesize
+                                                            : s->uvlinesize);
                 } else {
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[k][j] = (s->block[k][j] - 64) << 1;
                     s->idsp.put_pixels_clamped(s->block[k], dst[k],
-                                               k & 4 ? s->uvlinesize
-                                                     : s->linesize);
+                                               k < 4 ? s->linesize
+                                                     : s->uvlinesize);
                 }
             }
 
@@ -2702,6 +2692,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
             ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
 
         s->first_slice_line = 0;
+        mb_index += s->mb_width;
     }
     if (v->s.loop_filter)
         ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
@@ -2750,24 +2741,19 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
     }
 
     // do frame decode
-    s->mb_x             = s->mb_y = 0;
     s->mb_intra         = 1;
     s->first_slice_line = 1;
-    s->mb_y             = s->start_mb_y;
-    if (s->start_mb_y) {
-        s->mb_x = 0;
-        init_block_index(v);
-        memset(&s->coded_block[s->block_index[0] - s->b8_stride], 0,
-               (1 + s->b8_stride) * sizeof(*s->coded_block));
-    }
-    for (; s->mb_y < s->end_mb_y; s->mb_y++) {
+    mb_pos              = s->start_mb_y * s->mb_stride;
+    for (s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         init_block_index(v);
-        for (;s->mb_x < s->mb_width; s->mb_x++) {
+        if (s->mb_y == s->start_mb_y && s->start_mb_y)
+            memset(&s->coded_block[s->block_index[0] - s->b8_stride], 0,
+                   (1 + s->b8_stride) * sizeof(*s->coded_block));
+        for (; s->mb_x < s->mb_width; s->mb_x++) {
             int16_t (*block)[64] = v->block[v->cur_blk_idx];
             ff_update_block_index(s);
             s->bdsp.clear_blocks(block[0]);
-            mb_pos = s->mb_x + s->mb_y * s->mb_stride;
             s->current_picture.mb_type[mb_pos + v->mb_off]                         = MB_TYPE_INTRA;
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0;
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = 0;
@@ -2798,22 +2784,28 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                     int pred   = vc1_coded_block_pred(&v->s, k, &coded_val);
                     val        = val ^ pred;
                     *coded_val = val;
-                }
-                cbp |= val << (5 - k);
 
-                v->a_avail = !s->first_slice_line || (k == 2 || k == 3);
-                v->c_avail = !!s->mb_x || (k == 1 || k == 3);
+                    v->a_avail = !s->first_slice_line || (k & 2);
+                    v->c_avail = s->mb_x || (k & 1);
 
-                vc1_decode_i_block_adv(v, block[k], k, val,
-                                       (k < 4) ? v->codingset : v->codingset2, mquant);
+                    vc1_decode_i_block_adv(v, block[k], k, val,
+                                       v->codingset, mquant, mb_pos);
+                } else {
+                    v->a_avail = !s->first_slice_line;
+                    v->c_avail = !!s->mb_x;
+
+                    vc1_decode_i_block_adv(v, block[k], k, val,
+                                       v->codingset2, mquant, mb_pos);
+
+                    if (s->flags & CODEC_FLAG_GRAY)
+                        continue;
+                }
 
-                if (k > 3 && (s->flags & CODEC_FLAG_GRAY))
-                    continue;
                 v->vc1dsp.vc1_inv_trans_8x8(block[k]);
             }
 
             ff_vc1_smooth_overlap_filter_iblk(v);
-            vc1_put_signed_blocks_clamped(v);
+            vc1_put_signed_blocks_clamped(v, mb_pos);
             if (v->s.loop_filter)
                 ff_vc1_loop_filter_iblk_delayed(v, v->pq);
 
@@ -2824,26 +2816,28 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                        get_bits_count(&s->gb), v->bits);
                 return;
             }
+            mb_pos++;
         }
         if (!v->s.loop_filter)
             ff_mpeg_draw_horiz_band(s, s->mb_y * 16, 16);
         else if (s->mb_y)
-            ff_mpeg_draw_horiz_band(s, (s->mb_y-1) * 16, 16);
+            ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
         s->first_slice_line = 0;
+        mb_pos += s->mb_stride - s->mb_width;
     }
 
     /* raw bottom MB row */
     s->mb_x = 0;
     init_block_index(v);
-
-    for (;s->mb_x < s->mb_width; s->mb_x++) {
+    for (; s->mb_x < s->mb_width; s->mb_x++) {
         ff_update_block_index(s);
-        vc1_put_signed_blocks_clamped(v);
+        vc1_put_signed_blocks_clamped(v, mb_pos);
         if (v->s.loop_filter)
             ff_vc1_loop_filter_iblk_delayed(v, v->pq);
+        mb_pos++;
     }
     if (v->s.loop_filter)
-        ff_mpeg_draw_horiz_band(s, (s->end_mb_y-1)*16, 16);
+        ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
     ff_er_add_slice(&s->er, 0, s->start_mb_y << v->field_mode, s->mb_width - 1,
                     (s->end_mb_y << v->field_mode) - 1, ER_MB_END);
 }
@@ -2852,6 +2846,7 @@ static void vc1_decode_p_blocks(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     int apply_loop_filter;
+    int mb_pos;
 
     /* select codingmode used for VLC tables selection */
     switch (v->c_ac_table_index) {
@@ -2881,7 +2876,8 @@ static void vc1_decode_p_blocks(VC1Context *v)
     apply_loop_filter   = s->loop_filter && !(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY) &&
                           v->fcm == PROGRESSIVE;
     s->first_slice_line = 1;
-    memset(v->cbp_base, 0, sizeof(v->cbp_base[0])*2*s->mb_stride);
+    memset(v->cbp_base, 0, sizeof(v->cbp_base[0]) * 2 * s->mb_stride);
+    mb_pos              = s->start_mb_y * s->mb_stride;
     for (s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         init_block_index(v);
@@ -2889,10 +2885,11 @@ static void vc1_decode_p_blocks(VC1Context *v)
             ff_update_block_index(s);
 
             if (v->fcm == ILACE_FIELD)
-                vc1_decode_p_mb_intfi(v);
+                vc1_decode_p_mb_intfi(v, mb_pos);
             else if (v->fcm == ILACE_FRAME)
-                vc1_decode_p_mb_intfr(v);
-            else vc1_decode_p_mb(v);
+                vc1_decode_p_mb_intfr(v, mb_pos);
+            else
+                vc1_decode_p_mb(v, mb_pos);
             if (s->mb_y != s->start_mb_y && apply_loop_filter)
                 ff_vc1_apply_p_loop_filter(v);
             if (get_bits_count(&s->gb) > v->bits || get_bits_count(&s->gb) < 0) {
@@ -2902,13 +2899,16 @@ static void vc1_decode_p_blocks(VC1Context *v)
                        get_bits_count(&s->gb), v->bits, s->mb_x, s->mb_y);
                 return;
             }
+            mb_pos++;
         }
         memmove(v->cbp_base,      v->cbp,      sizeof(v->cbp_base[0])      * s->mb_stride);
         memmove(v->ttblk_base,    v->ttblk,    sizeof(v->ttblk_base[0])    * s->mb_stride);
         memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0]) * s->mb_stride);
         memmove(v->luma_mv_base,  v->luma_mv,  sizeof(v->luma_mv_base[0])  * s->mb_stride);
-        if (s->mb_y != s->start_mb_y) ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
+        if (s->mb_y != s->start_mb_y)
+            ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
         s->first_slice_line = 0;
+        mb_pos += s->mb_stride - s->mb_width;
     }
     if (apply_loop_filter) {
         s->mb_x = 0;
@@ -2927,6 +2927,7 @@ static void vc1_decode_p_blocks(VC1Context *v)
 static void vc1_decode_b_blocks(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
+    int mb_pos;
 
     /* select codingmode used for VLC tables selection */
     switch (v->c_ac_table_index) {
@@ -2954,6 +2955,7 @@ static void vc1_decode_b_blocks(VC1Context *v)
     }
 
     s->first_slice_line = 1;
+    mb_pos              = s->start_mb_y * s->mb_stride;
     for (s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         init_block_index(v);
@@ -2961,11 +2963,11 @@ static void vc1_decode_b_blocks(VC1Context *v)
             ff_update_block_index(s);
 
             if (v->fcm == ILACE_FIELD)
-                vc1_decode_b_mb_intfi(v);
+                vc1_decode_b_mb_intfi(v, mb_pos);
             else if (v->fcm == ILACE_FRAME)
-                vc1_decode_b_mb_intfr(v);
+                vc1_decode_b_mb_intfr(v, mb_pos);
             else
-                vc1_decode_b_mb(v);
+                vc1_decode_b_mb(v, mb_pos);
             if (get_bits_count(&s->gb) > v->bits || get_bits_count(&s->gb) < 0) {
                 // TODO: may need modification to handle slice coding
                 ff_er_add_slice(&s->er, 0, s->start_mb_y, s->mb_x, s->mb_y, ER_MB_ERROR);
@@ -2975,12 +2977,14 @@ static void vc1_decode_b_blocks(VC1Context *v)
             }
             if (v->s.loop_filter)
                 ff_vc1_loop_filter_iblk(v, v->pq);
+            mb_pos++;
         }
         if (!v->s.loop_filter)
             ff_mpeg_draw_horiz_band(s, s->mb_y * 16, 16);
         else if (s->mb_y)
             ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
         s->first_slice_line = 0;
+        mb_pos += s->mb_stride - s->mb_width;
     }
     if (v->s.loop_filter)
         ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
@@ -3012,7 +3016,6 @@ static void vc1_decode_skip_blocks(VC1Context *v)
 
 void ff_vc1_decode_blocks(VC1Context *v)
 {
-
     v->s.esc3_level_length = 0;
     if (v->x8_type) {
         ff_intrax8_decode_picture(&v->x8, 2*v->pq + v->halfpq, v->pq * !v->pquantizer);
-- 
2.1.0





More information about the ffmpeg-devel mailing list