[FFmpeg-devel] [PATCH] VP8 decoder
Michael Niedermayer
michaelni
Wed Jun 16 04:22:15 CEST 2010
On Tue, Jun 15, 2010 at 09:12:30AM -0400, David Conrad wrote:
> Hi,
>
> Since I might not have much time to work on this soon, I figure a round of review might be good.
>
> It's bitexact for all streams I have that do not use the bilinear filter (so not test vectors 3,4,7)
>
> Speed is currently ~10% faster than libvpx for intra-only, and 40-50% faster for normal video, probably mostly due to faster C loop filter functions.
>
> I tried implementing the border saving in h264 so that the loop filter could be run on the same row that was just decoded, but that isn't working yet.
>
> Edge emulation isn't fully supported as of yet since intra pred relies on pixels outside the frame being 127/129.
>
> arm/h264pred_init_arm.c | 7 +-
> h264pred.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++--
> h264pred.h | 2
> 3 files changed, 131 insertions(+), 8 deletions(-)
> 6af3e7c9c48fce13890c77b5dd4ffd12e788efd8 0001-VP8-intra-pred-functions.patch
> From b34add5a2859966864346b189bb493f5b773a662 Mon Sep 17 00:00:00 2001
> From: David Conrad <lessen42 at gmail.com>
> Date: Tue, 15 Jun 2010 08:57:35 -0400
> Subject: [PATCH 1/2] VP8 intra pred functions
>
> ---
> libavcodec/arm/h264pred_init_arm.c | 7 +-
> libavcodec/h264pred.c | 130 ++++++++++++++++++++++++++++++++++--
> libavcodec/h264pred.h | 2 +
> 3 files changed, 131 insertions(+), 8 deletions(-)
>
[...]
> +static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
> + const int lt= src[-1-1*stride];
> + LOAD_TOP_EDGE
> + LOAD_TOP_RIGHT_EDGE
> +
> + // 32-bit stores maybe
yes
[...]
> @@ -539,6 +610,20 @@ static void pred16x16_plane_rv40_c(uint8_t *src, int stride){
> pred16x16_plane_compat_c(src, stride, 0, 1);
> }
>
> +static void pred16x16_tm_vp8_c(uint8_t *src, int stride){
> + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
> + int tl = src[-1-stride];
> + uint8_t *top = src-stride;
> + int x, y;
> +
> + for (y = 0; y < 16; y++) {
> + int l = src[-1];
> + for (x = 0; x < 16; x++)
> + src[x] = cm[l + top[x] - tl];
&cm[l - tl]
can be factored out
[...]
> +static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
> + uint8_t *intra4x4)
> +{
> + VP56RangeCoder *c = &s->c;
> + int n;
> +
> + if (s->segmentation.update_map)
> + mb->segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
> +
> + mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
> +
> + if (s->keyframe) {
> + mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
> +
> + if (mb->mode == MODE_I4x4)
> + decode_intra4x4_modes(c, intra4x4, s->b4_stride, 1);
> + else
{}
also it might be worth to try patcheck
[...]
> +static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16])
> +{
> + int i, t0, t1, t2, t3;
> +
> + for (i = 0; i < 4; i++) {
> + t0 = dc[0*4+i] + dc[3*4+i];
> + t1 = dc[1*4+i] + dc[2*4+i];
> + t2 = dc[1*4+i] - dc[2*4+i];
> + t3 = dc[0*4+i] - dc[3*4+i];
> +
> + dc[0*4+i] = t0 + t1;
> + dc[1*4+i] = t3 + t2;
> + dc[2*4+i] = t0 - t1;
> + dc[3*4+i] = t3 - t2;
> + }
> +
> + for (i = 0; i < 4; i++) {
> + t0 = dc[i*4+0] + dc[i*4+3];
> + t1 = dc[i*4+1] + dc[i*4+2];
> + t2 = dc[i*4+1] - dc[i*4+2];
> + t3 = dc[i*4+0] - dc[i*4+3];
> +
> + *block[i][0] = (t0 + t1 + 3) >> 3;
> + *block[i][1] = (t3 + t2 + 3) >> 3;
> + *block[i][2] = (t0 - t1 + 3) >> 3;
> + *block[i][3] = (t3 - t2 + 3) >> 3;
the +3 can be moved up and factored
[...]
> +
> +#define clip_int8(x) av_clip(x, -128, 127)
> +
> +static av_always_inline void filter_common(uint8_t *p, int stride, int is4tap)
> +{
> + LOAD_PIXELS
> + int a, f1, f2;
> +
> + a = 3*(q0 - p0);
> +
> + if (is4tap)
> + a += clip_int8(p1 - q1);
> +
> + a = clip_int8(a);
> +
> + // We deviate from the spec here with c(a+3) >> 3
> + // since that's what libvpx does.
> + f1 = clip_int8(a+4) >> 3;
> + f2 = clip_int8(a+3) >> 3;
these only need cliping in one direction
and or the clip_int8() could be avoided
> +
> + // Despite what the spec says, we do need to clamp here to
> + // be bitexact with libvpx.
> + p[-1*stride] = av_clip_uint8(p0 + f2);
> + p[ 0*stride] = av_clip_uint8(q0 - f1);
for the !is4tap i think theres a bit more cliping done than needed
> +
> + // only used for _inner on blocks without high edge variance
> + if (!is4tap) {
> + a = (f1+1)>>1;
> + p[-2*stride] = av_clip_uint8(p1 + a);
> + p[ 1*stride] = av_clip_uint8(q1 - a);
> + }
> +}
> +
> +static av_always_inline int simple_limit(uint8_t *p, int stride, int flim)
> +{
> + LOAD_PIXELS
> + return 2*FFABS(p0-q0) + FFABS(p1-q1)/2 <= flim;
>>1 instead of /2 may be faster
> +}
> +
> +/**
> + * E - limit at the macroblock edge
> + * I - limit for interior difference
> + */
> +static av_always_inline int normal_limit(uint8_t *p, int stride, int E, int I)
> +{
> + LOAD_PIXELS
> + return simple_limit(p, stride, 2*E+I)
> + && FFABS(p3-p2) <= I && FFABS(p2-p1) <= I && FFABS(p1-p0) <= I
> + && FFABS(q3-q2) <= I && FFABS(q2-q1) <= I && FFABS(q1-q0) <= I;
> +}
> +
> +// high edge variance
> +static av_always_inline int hev(uint8_t *p, int stride, int thresh)
> +{
> + LOAD_PIXELS
> + return FFABS(p1-p0) > thresh || FFABS(q1-q0) > thresh;
> +}
> +
> +static av_always_inline void filter_mbedge(uint8_t *p, int stride)
> +{
> + int a0, a1, a2, w;
> +
> + LOAD_PIXELS
> +
> + w = clip_int8(p1-q1);
> + w = clip_int8(w + 3*(q0-p0));
> +
> + a0 = clip_int8((27*w + 63) >> 7);
> + a1 = clip_int8((18*w + 63) >> 7);
> + a2 = clip_int8(( 9*w + 63) >> 7);
the cliping is unneeded
[...]
> +static const uint8_t subpel_filters[7][6] = {
> + { 0, 6, 123, 12, 1, 0 },
> + { 2, 11, 108, 36, 8, 1 },
> + { 0, 9, 93, 50, 6, 0 },
> + { 3, 16, 77, 77, 16, 3 },
> + { 0, 6, 50, 93, 9, 0 },
> + { 1, 8, 36, 108, 11, 2 },
> + { 0, 1, 12, 123, 6, 0 },
> +};
> +
> +
> +#define FILTER_6TAP(src, F, stride) \
> + av_clip_uint8((F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + F[0]*src[x-2*stride] + \
> + F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + F[5]*src[x+3*stride] + 64) >> 7)
> +
> +#define VP8_EPEL(SIZE) \
> +static void put_vp8_epel ## SIZE ## _h_c(uint8_t *dst, uint8_t *src, int stride, int h, int mx, int my) \
> +{ \
> + const uint8_t *filter = subpel_filters[mx-1]; \
> + int x, y; \
> +\
> + for (y = 0; y < h; y++) { \
> + for (x = 0; x < SIZE; x++) \
> + dst[x] = FILTER_6TAP(src, filter, 1); \
> + dst += stride; \
> + src += stride; \
> + } \
> +} \
> +\
maybe it makes sense to write seperate functions for thr 4tap cases instead of 0*
also i dont mind if you commit it to svn to continue development there.
It surely is usefull alraedy to people if its faster than libvpx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Opposition brings concord. Out of discord comes the fairest harmony.
-- Heraclitus
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100616/c2cd4c07/attachment.pgp>
More information about the ffmpeg-devel
mailing list