[Ffmpeg-devel] Trellis Quantization Applied To (A)DPCM

Mon Jun 5 07:01:01 CEST 2006

On Sun, 4 Jun 2006, Michael Niedermayer wrote:
> On Sat, Jun 03, 2006 at 09:45:36PM -0700, Loren Merritt wrote:
>>
>> Hmm... my initial implementation doesn't see noticeable gains from K-Means
>> over just using a constant exponential table.
>
> i think that in case of optimally choosing the deltas a less "regular" table
> then your constant table would be supperior, the reason why i think so is that
> in your constant table many sums of elements are equal which should lead to
> fewer "easy reachable" values, for example:
>
> if we have a input of 0 12 3 then that could be encoded with +16,-16 or
> +8,-8 with your table if the table where slightly less regular then the
> 2 cases would be able to reach 2 different final results and that might
> lead to a better approximation ...

Added some asymmetric tables. They helped only a little, but I don't 
really know what to optimize for when writing a table by hand.

OTOH, a brute-force iterative search isn't too ridiculously slow. The next 
step would be to run a clustering algorithm like I proposed before, but 
this time looking for N tables to jointly fit all the frames of a training 
set, instead of clustering based on the statistics of one frame at a time.

--Loren Merritt
-------------- next part --------------
Index: allcodecs.c
===================================================================

--- allcodecs.c	(revision 5451)
+++ allcodecs.c	(working copy)
@@ -172,6 +172,9 @@
 #ifdef CONFIG_SNOW_ENCODER
     register_avcodec(&snow_encoder);
 #endif //CONFIG_SNOW_ENCODER
+#ifdef CONFIG_CYUV_ENCODER
+    register_avcodec(&cyuv_encoder);
+#endif //CONFIG_CYUV_ENCODER
 #ifdef CONFIG_ZLIB_ENCODER
     register_avcodec(&zlib_encoder);
 #endif //CONFIG_ZLIB_ENCODER
Index: avcodec.h
===================================================================
--- avcodec.h	(revision 5451)
+++ avcodec.h	(working copy)
@@ -2097,6 +2097,7 @@
 extern AVCodec vcr1_encoder;
 extern AVCodec ffv1_encoder;
 extern AVCodec snow_encoder;
+extern AVCodec cyuv_encoder;
 extern AVCodec mdec_encoder;
 extern AVCodec zlib_encoder;
 extern AVCodec sonic_encoder;
Index: cyuv.c
===================================================================
--- cyuv.c	(revision 5451)
+++ cyuv.c	(working copy)
@@ -28,6 +28,10 @@
  * Creative YUV (CYUV) Video Decoder.
  */
 
+#define DEBUG
+#undef NDEBUG
+#include <assert.h>
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -39,15 +43,20 @@
 #include "mpegvideo.h"
 
 
-typedef struct CyuvDecodeContext {
+typedef struct CyuvContext {
     AVCodecContext *avctx;
     int width, height;
     AVFrame frame;
-} CyuvDecodeContext;
+} CyuvContext;
 
+#define swap(a,b) {typeof(a) t = a; a = b; b = t;}
+#define put_pack4to8(a,b) (*buf++ = (a) + ((b)<<4))
+#undef printf
+
+
 static int cyuv_decode_init(AVCodecContext *avctx)
 {
-    CyuvDecodeContext *s = avctx->priv_data;
+    CyuvContext *s = avctx->priv_data;
 
     s->avctx = avctx;
     s->width = avctx->width;
@@ -65,7 +74,7 @@
                              void *data, int *data_size,
                              uint8_t *buf, int buf_size)
 {
-    CyuvDecodeContext *s=avctx->priv_data;
+    CyuvContext *s=avctx->priv_data;
 
     unsigned char *y_plane;
     unsigned char *u_plane;
@@ -168,16 +177,386 @@
 
 static int cyuv_decode_end(AVCodecContext *avctx)
 {
-/*    CyuvDecodeContext *s = avctx->priv_data;*/
+/*    CyuvContext *s = avctx->priv_data;*/
 
     return 0;
 }
 
+static int cyuv_encode_init(AVCodecContext *avctx)
+{
+    CyuvContext *s = avctx->priv_data;
+
+    s->avctx = avctx;
+    s->height = avctx->height;
+    s->width = avctx->width;
+    /* width needs to be divisible by 4 for this codec to work */
+    if (s->width & 0x3)
+        return -1;
+    if (avctx->pix_fmt != PIX_FMT_YUV411P)
+        return -1;
+    avctx->pix_fmt = PIX_FMT_YUV411P;
+    avctx->bits_per_sample= 12;
+    avctx->coded_frame= &s->frame;
+
+    return 0;
+}
+
+static int quantize_sample(int sample, int *pred, const int8_t *table, const uint8_t *itable)
+{
+    int p = *pred;
+    int bi = itable[(sample - p) & 0xff];
+    *pred = p + table[bi];
+    if(*pred & ~0xff){
+        int i, bscore=INT_MAX;
+        for(i=0; i<16; i++){
+            int score = abs(sample - ((p + table[i]) & 0xff));
+            if(score < bscore){
+                bscore = score;
+                bi = i;
+            }
+        }
+        *pred = (p + table[bi]) & 0xff;
+    }
+    return bi;
+}
+
+typedef struct TrellisPath {
+    int nibble;
+    int prev;
+} TrellisPath;
+
+typedef struct TrellisNode {
+    uint32_t ssd;
+    int path;
+    int pred;
+} TrellisNode;
+
+static int quantize_row_trellis(AVCodecContext *avctx, const uint8_t *samples,
+                                uint8_t *dst, const int8_t *table, const uint8_t *itable, int n)
+{
+    const int frontier = 1 << avctx->trellis;
+    const int max_paths = frontier*n;
+    TrellisPath paths[max_paths], *p;
+    TrellisNode node_buf[2][frontier];
+    TrellisNode *nodep_buf[2][frontier];
+    TrellisNode **nodes = nodep_buf[0]; // nodes[] is always sorted by .ssd
+    TrellisNode **nodes_next = nodep_buf[1];
+    int pathn = 0, i, j, k, d;
+
+    memset(nodep_buf, 0, sizeof(nodep_buf));
+    nodes[0] = &node_buf[0][0];
+    paths[0].nibble = samples[0] >> 4;
+    nodes[0]->pred = paths[0].nibble << 4;
+    nodes[0]->path = pathn++;
+    d = samples[0] - nodes[0]->pred;
+    nodes[0]->ssd = d*d;
+    if(samples[0] < 0xf0) {
+        nodes[1] = &node_buf[0][1];
+        paths[1].nibble = (samples[0] >> 4) + 1;
+        nodes[1]->pred = paths[1].nibble << 4;
+        nodes[1]->path = pathn++;
+        d = samples[0] - nodes[1]->pred;
+        nodes[1]->ssd = d*d;
+        if(nodes[1]->ssd < nodes[0]->ssd)
+            swap(nodes[1], nodes[0]);
+    }
+
+    for(i=1; i<n; i++) {
+        TrellisNode *t = node_buf[i&1];
+        TrellisNode **u;
+        const int sample = samples[i];
+        int range = 2;
+        memset(nodes_next, 0, frontier*sizeof(TrellisNode*));
+        for(j=0; j<frontier && nodes[j]; j++) {
+            const int pred = nodes[j]->pred;
+            const int div = itable[(sample - pred) & 0xff];
+            const int nmin = FFMAX(div-range, 0);
+            const int nmax = FFMIN(div+range, 15);
+            int nibble;
+            range = 1;
+            for(nibble=nmin; nibble<=nmax; nibble++) {
+                int dec_sample = (pred + table[nibble]) & 0xff;
+                int d = sample - dec_sample;
+                uint32_t ssd = nodes[j]->ssd + d*d;
+                if(nodes_next[frontier-1] && ssd >= nodes_next[frontier-1]->ssd)
+                    continue;
+                for(k=0; k<frontier && nodes_next[k]; k++) {
+                    if(dec_sample == nodes_next[k]->pred) {
+                        assert(ssd >= nodes_next[k]->ssd);
+                        goto next_nibble;
+                    }
+                }
+                for(k=0; k<frontier; k++) {
+                    if(!nodes_next[k] || ssd < nodes_next[k]->ssd) {
+                        TrellisNode *u = nodes_next[frontier-1];
+                        if(!u) {
+                            u = t++;
+                            u->path = pathn++;
+                        }
+                        u->ssd = ssd;
+                        u->pred = dec_sample;
+                        paths[u->path].nibble = nibble;
+                        paths[u->path].prev = nodes[j]->path;
+                        memmove(&nodes_next[k+1], &nodes_next[k], (frontier-k-1)*sizeof(TrellisNode*));
+                        nodes_next[k] = u;
+                        break;
+                    }
+                }
+                next_nibble:;
+            }
+        }
+
+        u = nodes;
+        nodes = nodes_next;
+        nodes_next = u;
+    }
+
+    p = &paths[nodes[0]->path];
+    for(i=n-1; i>=0; i--) {
+        dst[i] = p->nibble;
+        p = &paths[p->prev];
+    }
+    return nodes[0]->ssd;
+}
+
+static void build_inverse_table(uint8_t *itable, const int8_t *table)
+{
+    int d, i;
+    for(i=1; i<16; i++)
+        assert(table[i] > table[i-1]);
+    for(d=-128; d<128; d++){
+        int bscore = INT_MAX;
+        for(i=0; i<16; i++){
+            int score = abs(d - table[i]);
+            if(score > bscore)
+                break;
+            bscore = score;
+        }
+        itable[d&0xff] = i-1;
+    }
+}
+
+static int tables_tried = 0;
+
+static int try_row(const uint8_t *samples, const int8_t *table,
+                   const uint8_t *itable, int n)
+{
+    // trellis doesn't make much difference here, so just use the fast method
+    int pred = clip_uint8(samples[0]+8) & -16;
+    int d = samples[0] - pred;
+    int ssd = d*d;
+    int i;
+    for(i=1; i<n; i++){
+        quantize_sample(samples[i], &pred, table, itable);
+        d = samples[i] - pred;
+        ssd += d*d;
+    }
+    return ssd;
+}
+
+static uint64_t try_table(const int8_t *table, int8_t *dst, uint64_t *score,
+                          const uint8_t *pix, int w, int h, int stride)
+{
+    uint64_t ssd = 0;
+    uint8_t itable[256];
+    int y;
+    build_inverse_table(itable, table);
+    for(y=0; y<h; y++, pix+=stride){
+        ssd += try_row(pix, table, itable, w);
+    }
+    if(ssd < *score){
+        *score = ssd;
+        memcpy(dst, table, 16);
+    }
+    tables_tried++;
+    return ssd;
+}
+
+#define N_TABLES 4
+static const int8_t some_tables[N_TABLES][16] = {
+    {-128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64},
+    {-64, -32, -16, -8, -4, -3, -2, -1, 0, 1, 2, 3, 4, 8, 16, 32},
+    {-32, -16, -8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 6, 8, 16},
+    {-12, -8, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 8},
+};
+
+#define M_TABLES 4
+static const int8_t more_tables[M_TABLES][16] = {
+    {-96, -48, -24, -12, -6, -3, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64},
+    {-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 3, 6, 12, 24, 48, 96},
+    {-48, -24, -12, -6, -4, -3, -2, -1, 0, 1, 2, 4, 6, 8, 16, 32},
+    {-32, -16, -8, -6, -4, -2, -1, 0, 1, 2, 3, 4, 6, 12, 24, 48},
+};
+
+static void build_table(CyuvContext *s, int8_t *table, int plane)
+{
+    const uint8_t *pix = s->frame.data[plane];
+    const int stride = s->frame.linesize[plane];
+    const int h = s->height;
+    const int w = s->width >> (plane ? 2 : 0);
+    int i, j;
+    uint64_t ssd, bssd = 1ULL<<63;
+
+    for(i=0; i<N_TABLES; i++){
+        ssd = try_table(some_tables[plane ? N_TABLES-1-i : i], table, &bssd, pix, w, h, stride);
+        printf("%7lx ", ssd);
+        if(ssd == 0){
+            printf("\n");
+            return;
+        }
+        if(ssd > bssd){
+            for(i++ ;i<N_TABLES; i++)
+                printf("        ");
+            break;
+        }
+    }
+    if(table[0] <= -64){
+        for(i=0; i<M_TABLES; i++){
+            ssd = try_table(more_tables[i], table, &bssd, pix, w, h, stride);
+            printf("%7lx ", ssd);
+        }
+    }
+    printf("\n");
+
+    // Iterative refinement:
+    // 11-dimensional diamond search, encoding the whole frame at each step
+    if(s->avctx->context_model >= 2){
+        int8_t tmp_buf[18];
+        int8_t *tmp = tmp_buf+1;
+        tmp[-1] = (int8_t)-129;
+        tmp[16] = (int8_t)128;
+        for(j=0; j<40; j++){
+            int bssd_bakj = bssd;
+            printf("%7lx ", bssd);
+            for(i=0; i<16; i++)
+                printf(" %2x", table[i]&0xff);
+            printf("\n");
+            for(i=0; i<16; i++){
+                int v = table[i];
+                int bssd_baki = bssd;
+                if(v>=-2 && v<=2)
+                    continue;
+                memcpy(tmp, table, 16);
+                while((int8_t)(tmp[i]+1) != tmp[i+1] && tmp[i]-v <= j+5){
+                    tmp[i]++;
+                    ssd = try_table(tmp, table, &bssd, pix, w, h, stride);
+                    if(ssd > bssd)
+                        break;
+                }
+                if(bssd < bssd_baki)
+                    continue;
+                tmp[i] = v;
+                while((int8_t)(tmp[i]-1) != tmp[i-1] && v-tmp[i] <= j+5){
+                    tmp[i]--;
+                    ssd = try_table(tmp, table, &bssd, pix, w, h, stride);
+                    if(ssd > bssd)
+                        break;
+                }
+            }
+            if(bssd == bssd_bakj)
+                break;
+        }
+    }
+}
+
+static int cyuv_encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data)
+{
+    CyuvContext *s = avctx->priv_data;
+    AVFrame *pict = data;
+    int8_t *y_table = buf +  0;
+    int8_t *u_table = buf + 16;
+    int8_t *v_table = buf + 32;
+    uint8_t y_itable[256], u_itable[256], v_itable[256];
+    int x, y, p;
+    const int coded_size = 48 + s->height * s->width * 3/4;
+    assert(buf_size >= coded_size);
+    s->frame = *pict;
+    pict->error[0] = pict->error[1] = pict->error[2] = 0;
+
+    if(avctx->context_model > 0){
+        build_table(s, y_table, 0);
+        build_table(s, u_table, 1);
+        build_table(s, v_table, 2);
+        {
+            static int fn = 0;
+            printf("fn %d tables_tried %d \n", fn, tables_tried);
+            fn++;
+        }
+    }else{
+        memcpy(y_table, some_tables[0], 16);
+        memcpy(u_table, some_tables[1], 16);
+        memcpy(v_table, some_tables[1], 16);
+    }
+    build_inverse_table(y_itable, y_table);
+    build_inverse_table(u_itable, u_table);
+    build_inverse_table(v_itable, v_table);
+    buf += 48;
+
+    for(y=0; y<s->height; y++){
+        uint8_t *y_ptr = pict->data[0] + y*pict->linesize[0];
+        uint8_t *u_ptr = pict->data[1] + y*pict->linesize[1];
+        uint8_t *v_ptr = pict->data[2] + y*pict->linesize[2];
+        int y0, y1;
+
+        if(avctx->trellis > 0){
+            uint8_t y_buf[s->width];
+            uint8_t u_buf[s->width>>2];
+            uint8_t v_buf[s->width>>2];
+            pict->error[0] += quantize_row_trellis(avctx, y_ptr, y_buf, y_table, y_itable, s->width);
+            pict->error[1] += quantize_row_trellis(avctx, u_ptr, u_buf, u_table, u_itable, s->width>>2);
+            pict->error[2] += quantize_row_trellis(avctx, v_ptr, v_buf, v_table, v_itable, s->width>>2);
+            for(x=0; x<s->width; x+=4){
+                put_pack4to8(y_buf[x+0], u_buf[x>>2]);
+                put_pack4to8(y_buf[x+1], v_buf[x>>2]);
+                put_pack4to8(y_buf[x+2], y_buf[x+3]);
+            }
+        }else{
+            int y_pred = clip_uint8(y_ptr[0]+8) & -16;
+            int u_pred = clip_uint8(u_ptr[0]+8) & -16;
+            int v_pred = clip_uint8(v_ptr[0]+8) & -16;
+            put_pack4to8(y_pred>>4, u_pred>>4);
+            y0 = quantize_sample(y_ptr[1], &y_pred, y_table, y_itable);
+            put_pack4to8(y0, v_pred>>4);
+            y0 = quantize_sample(y_ptr[2], &y_pred, y_table, y_itable);
+            y1 = quantize_sample(y_ptr[3], &y_pred, y_table, y_itable);
+            put_pack4to8(y0, y1);
+            for(x=4; x<s->width; x+=4){
+                y_ptr+=4;
+                u_ptr++;
+                v_ptr++;
+                y0 = quantize_sample(y_ptr[0], &y_pred, y_table, y_itable);
+                y1 = quantize_sample(u_ptr[0], &u_pred, u_table, u_itable);
+                put_pack4to8(y0, y1);
+                y0 = quantize_sample(y_ptr[1], &y_pred, y_table, y_itable);
+                y1 = quantize_sample(v_ptr[0], &v_pred, v_table, v_itable);
+                put_pack4to8(y0, y1);
+                y0 = quantize_sample(y_ptr[2], &y_pred, y_table, y_itable);
+                y1 = quantize_sample(y_ptr[3], &y_pred, y_table, y_itable);
+                put_pack4to8(y0, y1);
+            }
+        }
+    }
+
+    fflush(stdout);
+
+    s->frame = *pict;
+    avctx->coded_frame = &s->frame;
+    for(p=0; p<3; p++)
+        avctx->error[p] += pict->error[p];
+    return coded_size;
+}
+
+static int cyuv_encode_end(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+
 AVCodec cyuv_decoder = {
     "cyuv",
     CODEC_TYPE_VIDEO,
     CODEC_ID_CYUV,
-    sizeof(CyuvDecodeContext),
+    sizeof(CyuvContext),
     cyuv_decode_init,
     NULL,
     cyuv_decode_end,
@@ -186,3 +565,12 @@
     NULL
 };
 
+AVCodec cyuv_encoder = {
+    "cyuv",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_CYUV,
+    sizeof(CyuvContext),
+    cyuv_encode_init,
+    cyuv_encode_frame,
+    cyuv_encode_end,
+};