[FFmpeg-devel] [PATCH] VP8 decoder

Ronald S. Bultje rsbultje
Tue Jun 22 16:43:31 CEST 2010


Hi,

On Tue, Jun 15, 2010 at 10:22 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Tue, Jun 15, 2010 at 09:12:30AM -0400, David Conrad wrote:
>> +static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? uint8_t *intra4x4)
>> +{
>> + ? ?VP56RangeCoder *c = &s->c;
>> + ? ?int n;
>> +
>> + ? ?if (s->segmentation.update_map)
>> + ? ? ? ?mb->segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
>> +
>> + ? ?mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
>> +
>> + ? ?if (s->keyframe) {
>> + ? ? ? ?mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
>> +
>
>> + ? ? ? ?if (mb->mode == MODE_I4x4)
>> + ? ? ? ? ? ?decode_intra4x4_modes(c, intra4x4, s->b4_stride, 1);
>> + ? ? ? ?else
>
> {}

Done.

> also it might be worth to try patcheck

Tried, didn't give me anything (I hope that's good).

>> +static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16])
>> +{
>> + ? ?int i, t0, t1, t2, t3;
>> +
>> + ? ?for (i = 0; i < 4; i++) {
>> + ? ? ? ?t0 = dc[0*4+i] + dc[3*4+i];
>> + ? ? ? ?t1 = dc[1*4+i] + dc[2*4+i];
>> + ? ? ? ?t2 = dc[1*4+i] - dc[2*4+i];
>> + ? ? ? ?t3 = dc[0*4+i] - dc[3*4+i];
>> +
>> + ? ? ? ?dc[0*4+i] = t0 + t1;
>> + ? ? ? ?dc[1*4+i] = t3 + t2;
>> + ? ? ? ?dc[2*4+i] = t0 - t1;
>> + ? ? ? ?dc[3*4+i] = t3 - t2;
>> + ? ?}
>> +
>> + ? ?for (i = 0; i < 4; i++) {
>> + ? ? ? ?t0 = dc[i*4+0] + dc[i*4+3];
>> + ? ? ? ?t1 = dc[i*4+1] + dc[i*4+2];
>> + ? ? ? ?t2 = dc[i*4+1] - dc[i*4+2];
>> + ? ? ? ?t3 = dc[i*4+0] - dc[i*4+3];
>> +
>> + ? ? ? ?*block[i][0] = (t0 + t1 + 3) >> 3;
>> + ? ? ? ?*block[i][1] = (t3 + t2 + 3) >> 3;
>> + ? ? ? ?*block[i][2] = (t0 - t1 + 3) >> 3;
>> + ? ? ? ?*block[i][3] = (t3 - t2 + 3) >> 3;
>
> the +3 can be moved up and factored

Done.

>> +static av_always_inline void filter_common(uint8_t *p, int stride, int is4tap)
>> +{
>> + ? ?LOAD_PIXELS
>> + ? ?int a, f1, f2;
>> +
>> + ? ?a = 3*(q0 - p0);
>> +
>> + ? ?if (is4tap)
>> + ? ? ? ?a += clip_int8(p1 - q1);
>> +
>> + ? ?a = clip_int8(a);
>> +
>
>> + ? ?// We deviate from the spec here with c(a+3) >> 3
>> + ? ?// since that's what libvpx does.
>> + ? ?f1 = clip_int8(a+4) >> 3;
>> + ? ?f2 = clip_int8(a+3) >> 3;
>
> these only need cliping in one direction
> and or the clip_int8() could be avoided

Changed to FFMIN(x, 127).

>> + ? ?// Despite what the spec says, we do need to clamp here to
>> + ? ?// be bitexact with libvpx.
>> + ? ?p[-1*stride] = av_clip_uint8(p0 + f2);
>> + ? ?p[ 0*stride] = av_clip_uint8(q0 - f1);
>
> for the !is4tap i think theres a bit more cliping done than needed

I looked through it and don't think this one can strictly be prevented.

>> +static av_always_inline int simple_limit(uint8_t *p, int stride, int flim)
>> +{
>> + ? ?LOAD_PIXELS
>> + ? ?return 2*FFABS(p0-q0) + FFABS(p1-q1)/2 <= flim;
>
>>>1 instead of /2 may be faster

Changed.

>> +static av_always_inline void filter_mbedge(uint8_t *p, int stride)
>> +{
>> + ? ?int a0, a1, a2, w;
>> +
>> + ? ?LOAD_PIXELS
>> +
>> + ? ?w = clip_int8(p1-q1);
>> + ? ?w = clip_int8(w + 3*(q0-p0));
>> +
>
>> + ? ?a0 = clip_int8((27*w + 63) >> 7);
>> + ? ?a1 = clip_int8((18*w + 63) >> 7);
>> + ? ?a2 = clip_int8(( 9*w + 63) >> 7);
>
> the cliping is unneeded

Indeed, removed.

>> +static const uint8_t subpel_filters[7][6] = {
>> + ? ?{ 0, ? 6, 123, ?12, ? 1, ? 0 },
>> + ? ?{ 2, ?11, 108, ?36, ? 8, ? 1 },
>> + ? ?{ 0, ? 9, ?93, ?50, ? 6, ? 0 },
>> + ? ?{ 3, ?16, ?77, ?77, ?16, ? 3 },
>> + ? ?{ 0, ? 6, ?50, ?93, ? 9, ? 0 },
>> + ? ?{ 1, ? 8, ?36, 108, ?11, ? 2 },
>> + ? ?{ 0, ? 1, ?12, 123, ? 6, ? 0 },
>> +};
>> +
>> +
>> +#define FILTER_6TAP(src, F, stride) \
>> + ? ?av_clip_uint8((F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + F[0]*src[x-2*stride] + \
>> + ? ? ? ? ? ? ? ? ? F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + F[5]*src[x+3*stride] + 64) >> 7)
>> +
>> +#define VP8_EPEL(SIZE) \
>> +static void put_vp8_epel ## SIZE ## _h_c(uint8_t *dst, uint8_t *src, int stride, int h, int mx, int my) \
>> +{ \
>> + ? ?const uint8_t *filter = subpel_filters[mx-1]; \
>> + ? ?int x, y; \
>> +\
>> + ? ?for (y = 0; y < h; y++) { \
>> + ? ? ? ?for (x = 0; x < SIZE; x++) \
>> + ? ? ? ? ? ?dst[x] = FILTER_6TAP(src, filter, 1); \
>> + ? ? ? ?dst += stride; \
>> + ? ? ? ?src += stride; \
>> + ? ?} \
>> +} \
>> +\
>
> maybe it makes sense to write seperate functions for thr 4tap cases instead of 0*

Added as suggested.

> also i dont mind if you commit it to svn to continue development there.
> It surely is usefull alraedy to people if its faster than libvpx

I have some x86 mmx optimizations already so I think this is a good
idea. Please let me know if the current patch is OK to apply.

Ronald
-------------- next part --------------
A non-text attachment was scrubbed...
Name: vp8_decoder.patch
Type: application/octet-stream
Size: 105515 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100622/4a60f92f/attachment.obj>



More information about the ffmpeg-devel mailing list