[Ffmpeg-devel] [PATCH] Chinese AVS video decoder

Michael Niedermayer michaelni
Thu Jun 29 02:56:22 CEST 2006


Hi

On Wed, Jun 28, 2006 at 10:42:42AM +0200, Stefan Gehrer wrote:
[...]
> +#define SLICE_MIN_START_CODE    0x00000101
> +#define SLICE_MAX_START_CODE    0x000001af
> +#define EXT_START_CODE          0x000001b5
> +#define USER_START_CODE         0x000001b2
> +#define SEQ_START_CODE          0x000001b0
> +#define PIC_I_START_CODE        0x000001b3
> +#define PIC_PB_START_CODE       0x000001b6

hmm, somehow i think we should not duplicate them from mpeg12.c and somehow
i think we should


[...]
> +enum pic_t {
> +  PIC_I,
> +  PIC_P,
> +  PIC_B
> +};

use FF_I/P/B_TYPE here


> +
> +static const int frame_rate_tab[][2] = {
> +  {    0,   1 }, //undefined
> +  {24000,1001 }, //23.98
> +  {   24,   1 },
> +  {   25,   1 },
> +  {30000,1001 }, //29.97
> +  {   30,   1 },
> +  {   50,   1 },
> +  {60000,1001 }, //59.94
> +  {   60,   1 },
> +};

this looks like the one from mpeg12data.h, that one should be made non static
and used instead


[...]
> +static const int cbp_tab[64][2] = {
> +  {63, 0},{15,15},{31,63},{47,31},{ 0,16},{14,32},{13,47},{11,13},
> +  { 7,14},{ 5,11},{10,12},{ 8, 5},{12,10},{61, 7},{ 4,48},{55, 3},
> +  { 1, 2},{ 2, 8},{59, 4},{ 3, 1},{62,61},{ 9,55},{ 6,59},{29,62},
> +  {45,29},{51,27},{23,23},{39,19},{27,30},{46,28},{53, 9},{30, 6},
> +  {43,60},{37,21},{60,44},{16,26},{21,51},{28,35},{19,18},{35,20},
> +  {42,24},{26,53},{44,17},{32,37},{58,39},{24,45},{20,58},{17,43},
> +  {18,42},{48,46},{22,36},{33,33},{25,34},{49,40},{40,52},{36,49},
> +  {34,50},{50,56},{52,25},{54,22},{41,54},{56,57},{38,41},{57,38}
> +};

this fits in an uint8_t which means we could save 75% memory, same issue
with many other tables


[...]

> +static const int frame_scan[64] = {
> +  0,   1,  8, 16,  9,  2,  3, 10,
> +  17, 24, 32, 25, 18, 11,  4,  5,
> +  12, 19, 26, 33, 40, 48, 41, 34,
> +  27, 20, 13,  6,  7, 14, 21, 28,
> +  35, 42, 49, 56, 57, 50, 43, 36,
> +  29, 22, 15, 23, 30, 37, 44, 51,
> +  58, 59, 52, 45, 38, 31, 39, 46,
> +  53, 60, 61, 54, 47, 55, 62, 63
> +};

duplicate of ff_zigzag_direct


[...]
> +
> +/*****************************************************************************
> + *
> + * in-loop deblocking filter
> + *
> + ****************************************************************************/
> +
> +#define P2 p0_p[-3*stride]
> +#define P1 p0_p[-2*stride]
> +#define P0 p0_p[-1*stride]
> +#define Q0 p0_p[ 0*stride]
> +#define Q1 p0_p[ 1*stride]
> +#define Q2 p0_p[ 2*stride]
> +
> +static inline void loop_filter_l2(uint8_t *p0_p,int stride,int alpha, int beta) {
> +    int p0 = P0;
> +    int q0 = Q0;
> +
> +    if(abs(p0-q0)<alpha && abs(P1-p0)<beta && abs(Q1-q0)<beta) {
> +        if(abs(P2-p0)<beta && abs(p0-q0)< (alpha/4+2)) {

if alpha is >= 0 then >>2 is faster then /4

> +            P0 = (P1 + 2*p0 + q0 + 2) >> 2;
> +            P1 = (2*P1 + p0 + q0 + 2) >> 2;
> +        } else
> +            P0 = (2*P1 + p0 + q0 + 2) >> 2;
> +        if(abs(Q2-q0)<beta && abs(q0-p0)< (alpha/4+2)) {
> +            Q0 = (Q1 + 2*q0 + p0 + 2) >> 2;
> +            Q1 = (2*Q1 + q0 + p0 + 2) >> 2;
> +        } else
> +            Q0 = (2*Q1 + q0 + p0 + 2) >> 2;
> +    }

this can be simplified to:

int s= P0 + Q0 + 2;
alpha = (alpha>>2) + 2;
if(abs(P2-p0)<beta && abs(p0-q0)< alpha) {
    P0 = (P1 + p0 + s) >> 2;
    P1 = (2*P1 + s) >> 2;
} else
    P0 = (2*P1 + s) >> 2;
if(abs(Q2-q0)<beta && abs(q0-p0)< alpha) {
    Q0 = (Q1 + q0 + s) >> 2;
    Q1 = (2*Q1 + s) >> 2;
} else
    Q0 = (2*Q1 + s) >> 2;


and the whole loop filter stuff should be moved to dsputil (please in 
its own avs specific file as dsputil.c is already pretty large)

[...]

> +
> +static inline int get_bs_p(vector_t *mvP, vector_t *mvQ) {
> +    if((mvP->ref == REF_INTRA) || (mvQ->ref == REF_INTRA))
> +        return 2;
> +    if(mvP->ref != mvQ->ref)
> +        return 1;
> +    if( (abs(mvP->x - mvQ->x) >= 4) ||  (abs(mvP->y - mvQ->y) >= 4) )
> +        return 1;
> +    return 0;
> +}

you really have alot of abs(...) </> CONSTANT in your code, maybe a
macro like
#define ABSCMP(x,c) ((x)+(c)) > (unsigned)(2*(c))

would make the code faster, that of course would need to be benchmarked
if you want to try it


[...]
> +static void intra_pred_vert(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int y;
> +    uint64_t a = *((uint64_t *)(&top[1]));
> +    for(y=0;y<8;y++) {
> +        *((uint64_t *)(d+y*stride)) = a;
> +    }
> +}
> +
> +static void intra_pred_horiz(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int y;
> +    uint64_t a;
> +    for(y=0;y<8;y++) {
> +        a = left[y+1] * 0x0101010101010101ULL;
> +        *((uint64_t *)(d+y*stride)) = a;
> +    }
> +}
> +
> +static void intra_pred_dc_128(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int y;
> +    uint64_t a = 0x8080808080808080ULL;
> +    for(y=0;y<8;y++)
> +        *((uint64_t *)(d+y*stride)) = a;
> +}
> +
> +static void intra_pred_plane(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int x,y,ia;
> +    int ih = 0;
> +    int iv = 0;
> +    uint8_t *cm = cropTbl + MAX_NEG_CROP;
> +
> +    for(x=0; x<4; x++) {
> +        ih += (x+1)*(top[5+x]-top[3-x]);
> +        iv += (x+1)*(left[5+x]-left[3-x]);
> +    }
> +    ia = (top[8]+left[8])<<4;
> +    ih = (17*ih+16)>>5;
> +    iv = (17*iv+16)>>5;
> +    for(y=0; y<8; y++)
> +        for(x=0; x<8; x++)
> +            d[y*stride+x] = cm[(ia+(x-3)*ih+(y-3)*iv+16)>>5];
> +}
> +
> +#define LOWPASS(ARRAY,INDEX)                                            \
> +    (( ARRAY[(INDEX)-1] + 2*ARRAY[(INDEX)] + ARRAY[(INDEX)+1] + 2) >> 2)
> +
> +static void intra_pred_lp(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int x,y;
> +    for(y=0; y<8; y++)
> +        for(x=0; x<8; x++)
> +            d[y*stride+x] = (LOWPASS(top,x+1) + LOWPASS(left,y+1)) >> 1;
> +}
> +
> +static void intra_pred_down_left(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int x,y;
> +    for(y=0; y<8; y++)
> +        for(x=0; x<8; x++)
> +            d[y*stride+x] = (LOWPASS(top,x+y+2) + LOWPASS(left,x+y+2)) >> 1;
> +}
> +
> +static void intra_pred_down_right(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int x,y;
> +    for(y=0; y<8; y++)
> +        for(x=0; x<8; x++)
> +            if(x==y)
> +                d[y*stride+x] = (left[1]+2*top[0]+top[1]+2)>>2;
> +            else if(x>y)
> +                d[y*stride+x] = LOWPASS(top,x-y);
> +            else
> +                d[y*stride+x] = LOWPASS(left,y-x);
> +}
> +
> +static void intra_pred_lp_left(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int x,y;
> +    for(y=0; y<8; y++)
> +        for(x=0; x<8; x++)
> +            d[y*stride+x] = LOWPASS(left,y+1);
> +}
> +
> +static void intra_pred_lp_top(uint8_t *d,uint8_t *top,uint8_t *left,int stride) {
> +    int x,y;
> +    for(y=0; y<8; y++)
> +        for(x=0; x<8; x++)
> +            d[y*stride+x] = LOWPASS(top,x+1);
> +}
> +
> +#undef LOWPASS

some of the above look similar to h.264, they shouldnt be duplicated if they
are identical


[...]

> +/*****************************************************************************
> + *
> + * inverse transform
> + *
> + ****************************************************************************/
> +
> +static void cavs_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride) {
> +    int i;
> +    DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
> +    uint8_t *cm = cropTbl + MAX_NEG_CROP;
> +
> +    for( i = 0; i < 8; i++ ) {
> +        const int a0 =  3*src[i][1] - (src[i][7]<<1);
> +        const int a1 =  3*src[i][3] + (src[i][5]<<1);
> +        const int a2 =  (src[i][3]<<1) - 3*src[i][5];
> +        const int a3 =  (src[i][1]<<1) + 3*src[i][7];
> +
> +        const int b4 = ((a0 + a1 + a3)<<1) + a1;
> +        const int b5 = ((a0 - a1 + a2)<<1) + a0;
> +        const int b6 = ((a3 - a2 - a1)<<1) + a3;
> +        const int b7 = ((a0 - a2 - a3)<<1) - a2;
> +
> +        const int a7 = (src[i][2]<<2) - 10*src[i][6];
> +        const int a6 = (src[i][6]<<2) + 10*src[i][2];
> +        const int a5 = (src[i][0] - src[i][4]) << 3;
> +        const int a4 = (src[i][0] + src[i][4]) << 3;
> +
> +        const int b0 = a4 + a6;
> +        const int b1 = a5 + a7;
> +        const int b2 = a5 - a7;
> +        const int b3 = a4 - a6;
> +
> +        src[i][0] = (b0 + b4 + 4) >> 3;
> +        src[i][1] = (b1 + b5 + 4) >> 3;
> +        src[i][2] = (b2 + b6 + 4) >> 3;
> +        src[i][3] = (b3 + b7 + 4) >> 3;
> +        src[i][4] = (b3 - b7 + 4) >> 3;
> +        src[i][5] = (b2 - b6 + 4) >> 3;
> +        src[i][6] = (b1 - b5 + 4) >> 3;
> +        src[i][7] = (b0 - b4 + 4) >> 3;
> +    }
> +    for( i = 0; i < 8; i++ ) {
> +        const int a0 =  3*src[1][i] - (src[7][i]<<1);
> +        const int a1 =  3*src[3][i] + (src[5][i]<<1);
> +        const int a2 =  (src[3][i]<<1) - 3*src[5][i];
> +        const int a3 =  (src[1][i]<<1) + 3*src[7][i];
> +
> +        const int b4 = ((a0 + a1 + a3)<<1) + a1;
> +        const int b5 = ((a0 - a1 + a2)<<1) + a0;
> +        const int b6 = ((a3 - a2 - a1)<<1) + a3;
> +        const int b7 = ((a0 - a2 - a3)<<1) - a2;
> +
> +        const int a7 = (src[2][i]<<2) - 10*src[6][i];
> +        const int a6 = (src[6][i]<<2) + 10*src[2][i];
> +        const int a5 = (src[0][i] - src[4][i]) << 3;
> +        const int a4 = (src[0][i] + src[4][i]) << 3;
> +
> +        const int b0 = a4 + a6;
> +        const int b1 = a5 + a7;
> +        const int b2 = a5 - a7;
> +        const int b3 = a4 - a6;
> +
> +        dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b4 + 64) >> 7)];
> +        dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b1 + b5 + 64) >> 7)];
> +        dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b2 + b6 + 64) >> 7)];
> +        dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b3 + b7 + 64) >> 7)];
> +        dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b3 - b7 + 64) >> 7)];
> +        dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b2 - b6 + 64) >> 7)];
> +        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b1 - b5 + 64) >> 7)];
> +        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b4 + 64) >> 7)];
> +    }
> +}

above could go into dsputil


[...]
> +static inline void mv_pred_direct(AVSContext *h, vector_t *pmv_fw,
> +                                  vector_t *pmv_bw, vector_t *col_mv) {
> +    pmv_fw->dist = h->dist[1];
> +    pmv_bw->dist = h->dist[0];
> +    pmv_fw->ref = 1;
> +    pmv_bw->ref = 0;
> +    //TODO check if this can be done branchless

of course can it be done branchless (X= (col_mv->x>>31) and ^X-X should help)
and when you are already at it, get rid of the division too


> +    if(col_mv->x < 0) {
> +        pmv_fw->x = -(((16384/col_mv->dist)*(1-col_mv->x*pmv_fw->dist)-1)>>14);
> +        pmv_bw->x =  (((16384/col_mv->dist)*(1-col_mv->x*pmv_bw->dist)-1)>>14);
> +    } else {
> +        pmv_fw->x =  (((16384/col_mv->dist)*(1+col_mv->x*pmv_fw->dist)-1)>>14);
> +        pmv_bw->x = -(((16384/col_mv->dist)*(1+col_mv->x*pmv_bw->dist)-1)>>14);
> +    }

[...]
> +static void cavs_get_buffer(AVCodecContext *avctx, AVSContext *h, Picture *pic) {
> +    if(h->picture.data[0])
> +        cavs_release_buffer(avctx, &h->picture);
> +    pic->linesize[0] = (h->mb_width*16) + 32;
> +    pic->linesize[1] = pic->linesize[2] = (((h->mb_width+1)/2)*16) + 32;
> +    pic->base[0] = av_malloc(pic->linesize[0] * (avctx->height + 32));
> +    pic->base[1] = av_malloc(pic->linesize[1] * (avctx->height/2 + 16));
> +    pic->base[2] = av_malloc(pic->linesize[2] * (avctx->height/2 + 16));
> +    pic->mb_type_base = av_malloc(h->mb_width*h->mb_height*sizeof(uint32_t));
> +    pic->data[0] = pic->base[0] + 16*pic->linesize[0] + 16;
> +    pic->data[1] = pic->base[1] + 8*pic->linesize[1] + 16;
> +    pic->data[2] = pic->base[2] + 8*pic->linesize[2] + 16;
> +}

this is not acceptable, please use get/release_buffer() from AVCodecContext


[...]
> +static int cavs_decode_frame(AVCodecContext * avctx,void *data, int *data_size,
> +                             uint8_t * buf, int buf_size) {
> +    AVSContext *h = avctx->priv_data;
> +    MpegEncContext *s = &h->s;
> +    int input_size;
> +    const uint8_t *buf_end;
> +    const uint8_t *buf_ptr;
> +    AVFrame *picture = data;
> +    int stc = -1;
> +
> +    s->avctx = avctx;
> +
> +    if (buf_size == 0) {
> +        return 0;
> +    }

this is wrong, you must output the not yet outputed delayed frames here

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In the past you could go to a library and read, borrow or copy any book
Today you'd get arrested for mere telling someone where the library is




More information about the ffmpeg-devel mailing list