[FFmpeg-devel] [PATCH] Fix non-rounding up to next 16-bit aligned bug in IFF decoder
Michael Niedermayer
michaelni
Thu Apr 29 20:22:04 CEST 2010
On Thu, Apr 29, 2010 at 12:51:44PM -0400, Ronald S. Bultje wrote:
> Hi,
>
> On Thu, Apr 29, 2010 at 10:37 AM, Sebastian Vater
> <cdgs.basty at googlemail.com> wrote:
> > ? ?START_TIMER;
> > ? ?const uint8_t *end = dst + (buf_size * 8);
> > ? ?const uint32_t *lut = plane8_lut[plane];
> > ? ?for(; dst < end; dst += 8) {
> > ? ? ? ?const uint8_t lut_offsets = *buf++;
> > ? ? ? ?uint32_t v ?= AV_RN32A(dst) | lut[lut_offsets >> 4];
> > ? ? ? ?uint32_t v2 ?= AV_RN32A(dst + 4) | lut[lut_offsets & 0x0F];
> > ? ? ? ?AV_WN32A(dst, v);
> > ? ? ? ?AV_WN32A(dst + 4, v2);
> > ? ?}
> > ? ?STOP_TIMER("decodeplane8");
> [..]
> > ? ? ?52: ? ? ? 0f b6 06 ? ? ? ? ? ? ? ?movzbl (%esi),%eax
> > ? ? ?55: ? ? ? 83 c6 01 ? ? ? ? ? ? ? ?add ? ?$0x1,%esi
>
> const uint8_t lut_offsets = *buf++;
>
> > ? ? ?58: ? ? ? 8b 53 04 ? ? ? ? ? ? ? ?mov ? ?0x4(%ebx),%edx
>
> read dst+4
>
> > ? ? ?5b: ? ? ? 89 c1 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%ecx
>
> move lut_offset from eac->ecx
>
> > ? ? ?5d: ? ? ? 83 e1 0f ? ? ? ? ? ? ? ?and ? ?$0xf,%ecx
>
> ecx has lower 4 bits
>
> > ? ? ?60: ? ? ? 0b 14 8f ? ? ? ? ? ? ? ?or ? ? (%edi,%ecx,4),%edx
>
> edx |= table
>
> > ? ? ?63: ? ? ? c0 e8 04 ? ? ? ? ? ? ? ?shr ? ?$0x4,%al
>
> eax has higher 4 bits
>
> > ? ? ?66: ? ? ? 0f b6 c0 ? ? ? ? ? ? ? ?movzbl %al,%eax
>
> Err...?
>
> > ? ? ?69: ? ? ? 8b 04 87 ? ? ? ? ? ? ? ?mov ? ?(%edi,%eax,4),%eax
>
> table value
>
> > ? ? ?6c: ? ? ? 09 03 ? ? ? ? ? ? ? ? ? or ? ? %eax,(%ebx)
>
> read/write dest (note how this is a single instruction).
>
> > ? ? ?6e: ? ? ? 89 53 04 ? ? ? ? ? ? ? ?mov ? ?%edx,0x4(%ebx)
>
> write dst+4 (note how this isn't a single instruction)
>
> > ? ? ?71: ? ? ? 83 c3 08 ? ? ? ? ? ? ? ?add ? ?$0x8,%ebx
> > ? ? ?74: ? ? ? 39 dd ? ? ? ? ? ? ? ? ? cmp ? ?%ebx,%ebp
> > ? ? ?76: ? ? ? 77 da ? ? ? ? ? ? ? ? ? ja ? ? 52 <decodeplane8+0x52>
>
> loop end
>
> You probably see from the disassembly you could remove another few
> instructions by doing the calculations serially instead of parallel.
> For example, it reads/writes dst in a single instruction, but uses
> several instructions for dst+4. Serializing this might improve the
> code another few instructions.
>
> for() {
> uint32_t v;
> v = AV_RN32A(..) | lut[x>>4];
> AV_WN32A(dst, v);
> v = AV_RN32A(..) | lut[x&0xf];
> AV_WN32A(dst, v+4);
> }
>
> Ideally this becomes (12 instructions instead of original 14):
>
> movzbl (%esi),%eax
> add $0x1,%esi
> mov %eax,%ecx
> shr $0x4,%eax
> and $0xf,%ecx
> mov (%edi,%eax,4),%eax
> mov (%edi,%ecx,4),%ecx
> or %eax,(%ebx)
> or %ecx,(%ebx, 4) <- not sure if that's possible
> add $0x8,%ebx
> cmp %ebx,%ebp
> ja 52 <decodeplane8+0x52>
movzbl (%esi,%ebx),%eax
mov (%edi,%eax,8),%ecx
or %ecx,(%edx, %ebx)
mov 4(%edi,%eax,8),%eax
or %eax,4(%edx, %ebx)
add $0x8,%ebx
jnc
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
The bravest are surely those who have the clearest vision
of what is before them, glory and danger alike, and yet
notwithstanding go out to meet it. -- Thucydides
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100429/2d800326/attachment.pgp>
More information about the ffmpeg-devel
mailing list