[Ffmpeg-devel] Trellis Quantization Applied To (A)DPCM
Loren Merritt
lorenm
Mon Jun 5 07:01:01 CEST 2006
On Sun, 4 Jun 2006, Michael Niedermayer wrote:
> On Sat, Jun 03, 2006 at 09:45:36PM -0700, Loren Merritt wrote:
>>
>> Hmm... my initial implementation doesn't see noticeable gains from K-Means
>> over just using a constant exponential table.
>
> i think that in case of optimally choosing the deltas a less "regular" table
> then your constant table would be supperior, the reason why i think so is that
> in your constant table many sums of elements are equal which should lead to
> fewer "easy reachable" values, for example:
>
> if we have a input of 0 12 3 then that could be encoded with +16,-16 or
> +8,-8 with your table if the table where slightly less regular then the
> 2 cases would be able to reach 2 different final results and that might
> lead to a better approximation ...
Added some asymmetric tables. They helped only a little, but I don't
really know what to optimize for when writing a table by hand.
OTOH, a brute-force iterative search isn't too ridiculously slow. The next
step would be to run a clustering algorithm like I proposed before, but
this time looking for N tables to jointly fit all the frames of a training
set, instead of clustering based on the statistics of one frame at a time.
--Loren Merritt
-------------- next part --------------
Index: allcodecs.c
===================================================================
--- allcodecs.c (revision 5451)
+++ allcodecs.c (working copy)
@@ -172,6 +172,9 @@
#ifdef CONFIG_SNOW_ENCODER
register_avcodec(&snow_encoder);
#endif //CONFIG_SNOW_ENCODER
+#ifdef CONFIG_CYUV_ENCODER
+ register_avcodec(&cyuv_encoder);
+#endif //CONFIG_CYUV_ENCODER
#ifdef CONFIG_ZLIB_ENCODER
register_avcodec(&zlib_encoder);
#endif //CONFIG_ZLIB_ENCODER
Index: avcodec.h
===================================================================
--- avcodec.h (revision 5451)
+++ avcodec.h (working copy)
@@ -2097,6 +2097,7 @@
extern AVCodec vcr1_encoder;
extern AVCodec ffv1_encoder;
extern AVCodec snow_encoder;
+extern AVCodec cyuv_encoder;
extern AVCodec mdec_encoder;
extern AVCodec zlib_encoder;
extern AVCodec sonic_encoder;
Index: cyuv.c
===================================================================
--- cyuv.c (revision 5451)
+++ cyuv.c (working copy)
@@ -28,6 +28,10 @@
* Creative YUV (CYUV) Video Decoder.
*/
+#define DEBUG
+#undef NDEBUG
+#include <assert.h>
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -39,15 +43,20 @@
#include "mpegvideo.h"
-typedef struct CyuvDecodeContext {
+typedef struct CyuvContext {
AVCodecContext *avctx;
int width, height;
AVFrame frame;
-} CyuvDecodeContext;
+} CyuvContext;
+#define swap(a,b) {typeof(a) t = a; a = b; b = t;}
+#define put_pack4to8(a,b) (*buf++ = (a) + ((b)<<4))
+#undef printf
+
+
static int cyuv_decode_init(AVCodecContext *avctx)
{
- CyuvDecodeContext *s = avctx->priv_data;
+ CyuvContext *s = avctx->priv_data;
s->avctx = avctx;
s->width = avctx->width;
@@ -65,7 +74,7 @@
void *data, int *data_size,
uint8_t *buf, int buf_size)
{
- CyuvDecodeContext *s=avctx->priv_data;
+ CyuvContext *s=avctx->priv_data;
unsigned char *y_plane;
unsigned char *u_plane;
@@ -168,16 +177,386 @@
static int cyuv_decode_end(AVCodecContext *avctx)
{
-/* CyuvDecodeContext *s = avctx->priv_data;*/
+/* CyuvContext *s = avctx->priv_data;*/
return 0;
}
+static int cyuv_encode_init(AVCodecContext *avctx)
+{
+ CyuvContext *s = avctx->priv_data;
+
+ s->avctx = avctx;
+ s->height = avctx->height;
+ s->width = avctx->width;
+ /* width needs to be divisible by 4 for this codec to work */
+ if (s->width & 0x3)
+ return -1;
+ if (avctx->pix_fmt != PIX_FMT_YUV411P)
+ return -1;
+ avctx->pix_fmt = PIX_FMT_YUV411P;
+ avctx->bits_per_sample= 12;
+ avctx->coded_frame= &s->frame;
+
+ return 0;
+}
+
+static int quantize_sample(int sample, int *pred, const int8_t *table, const uint8_t *itable)
+{
+ int p = *pred;
+ int bi = itable[(sample - p) & 0xff];
+ *pred = p + table[bi];
+ if(*pred & ~0xff){
+ int i, bscore=INT_MAX;
+ for(i=0; i<16; i++){
+ int score = abs(sample - ((p + table[i]) & 0xff));
+ if(score < bscore){
+ bscore = score;
+ bi = i;
+ }
+ }
+ *pred = (p + table[bi]) & 0xff;
+ }
+ return bi;
+}
+
+typedef struct TrellisPath {
+ int nibble;
+ int prev;
+} TrellisPath;
+
+typedef struct TrellisNode {
+ uint32_t ssd;
+ int path;
+ int pred;
+} TrellisNode;
+
+static int quantize_row_trellis(AVCodecContext *avctx, const uint8_t *samples,
+ uint8_t *dst, const int8_t *table, const uint8_t *itable, int n)
+{
+ const int frontier = 1 << avctx->trellis;
+ const int max_paths = frontier*n;
+ TrellisPath paths[max_paths], *p;
+ TrellisNode node_buf[2][frontier];
+ TrellisNode *nodep_buf[2][frontier];
+ TrellisNode **nodes = nodep_buf[0]; // nodes[] is always sorted by .ssd
+ TrellisNode **nodes_next = nodep_buf[1];
+ int pathn = 0, i, j, k, d;
+
+ memset(nodep_buf, 0, sizeof(nodep_buf));
+ nodes[0] = &node_buf[0][0];
+ paths[0].nibble = samples[0] >> 4;
+ nodes[0]->pred = paths[0].nibble << 4;
+ nodes[0]->path = pathn++;
+ d = samples[0] - nodes[0]->pred;
+ nodes[0]->ssd = d*d;
+ if(samples[0] < 0xf0) {
+ nodes[1] = &node_buf[0][1];
+ paths[1].nibble = (samples[0] >> 4) + 1;
+ nodes[1]->pred = paths[1].nibble << 4;
+ nodes[1]->path = pathn++;
+ d = samples[0] - nodes[1]->pred;
+ nodes[1]->ssd = d*d;
+ if(nodes[1]->ssd < nodes[0]->ssd)
+ swap(nodes[1], nodes[0]);
+ }
+
+ for(i=1; i<n; i++) {
+ TrellisNode *t = node_buf[i&1];
+ TrellisNode **u;
+ const int sample = samples[i];
+ int range = 2;
+ memset(nodes_next, 0, frontier*sizeof(TrellisNode*));
+ for(j=0; j<frontier && nodes[j]; j++) {
+ const int pred = nodes[j]->pred;
+ const int div = itable[(sample - pred) & 0xff];
+ const int nmin = FFMAX(div-range, 0);
+ const int nmax = FFMIN(div+range, 15);
+ int nibble;
+ range = 1;
+ for(nibble=nmin; nibble<=nmax; nibble++) {
+ int dec_sample = (pred + table[nibble]) & 0xff;
+ int d = sample - dec_sample;
+ uint32_t ssd = nodes[j]->ssd + d*d;
+ if(nodes_next[frontier-1] && ssd >= nodes_next[frontier-1]->ssd)
+ continue;
+ for(k=0; k<frontier && nodes_next[k]; k++) {
+ if(dec_sample == nodes_next[k]->pred) {
+ assert(ssd >= nodes_next[k]->ssd);
+ goto next_nibble;
+ }
+ }
+ for(k=0; k<frontier; k++) {
+ if(!nodes_next[k] || ssd < nodes_next[k]->ssd) {
+ TrellisNode *u = nodes_next[frontier-1];
+ if(!u) {
+ u = t++;
+ u->path = pathn++;
+ }
+ u->ssd = ssd;
+ u->pred = dec_sample;
+ paths[u->path].nibble = nibble;
+ paths[u->path].prev = nodes[j]->path;
+ memmove(&nodes_next[k+1], &nodes_next[k], (frontier-k-1)*sizeof(TrellisNode*));
+ nodes_next[k] = u;
+ break;
+ }
+ }
+ next_nibble:;
+ }
+ }
+
+ u = nodes;
+ nodes = nodes_next;
+ nodes_next = u;
+ }
+
+ p = &paths[nodes[0]->path];
+ for(i=n-1; i>=0; i--) {
+ dst[i] = p->nibble;
+ p = &paths[p->prev];
+ }
+ return nodes[0]->ssd;
+}
+
+static void build_inverse_table(uint8_t *itable, const int8_t *table)
+{
+ int d, i;
+ for(i=1; i<16; i++)
+ assert(table[i] > table[i-1]);
+ for(d=-128; d<128; d++){
+ int bscore = INT_MAX;
+ for(i=0; i<16; i++){
+ int score = abs(d - table[i]);
+ if(score > bscore)
+ break;
+ bscore = score;
+ }
+ itable[d&0xff] = i-1;
+ }
+}
+
+static int tables_tried = 0;
+
+static int try_row(const uint8_t *samples, const int8_t *table,
+ const uint8_t *itable, int n)
+{
+ // trellis doesn't make much difference here, so just use the fast method
+ int pred = clip_uint8(samples[0]+8) & -16;
+ int d = samples[0] - pred;
+ int ssd = d*d;
+ int i;
+ for(i=1; i<n; i++){
+ quantize_sample(samples[i], &pred, table, itable);
+ d = samples[i] - pred;
+ ssd += d*d;
+ }
+ return ssd;
+}
+
+static uint64_t try_table(const int8_t *table, int8_t *dst, uint64_t *score,
+ const uint8_t *pix, int w, int h, int stride)
+{
+ uint64_t ssd = 0;
+ uint8_t itable[256];
+ int y;
+ build_inverse_table(itable, table);
+ for(y=0; y<h; y++, pix+=stride){
+ ssd += try_row(pix, table, itable, w);
+ }
+ if(ssd < *score){
+ *score = ssd;
+ memcpy(dst, table, 16);
+ }
+ tables_tried++;
+ return ssd;
+}
+
+#define N_TABLES 4
+static const int8_t some_tables[N_TABLES][16] = {
+ {-128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64},
+ {-64, -32, -16, -8, -4, -3, -2, -1, 0, 1, 2, 3, 4, 8, 16, 32},
+ {-32, -16, -8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 6, 8, 16},
+ {-12, -8, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 8},
+};
+
+#define M_TABLES 4
+static const int8_t more_tables[M_TABLES][16] = {
+ {-96, -48, -24, -12, -6, -3, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64},
+ {-64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 3, 6, 12, 24, 48, 96},
+ {-48, -24, -12, -6, -4, -3, -2, -1, 0, 1, 2, 4, 6, 8, 16, 32},
+ {-32, -16, -8, -6, -4, -2, -1, 0, 1, 2, 3, 4, 6, 12, 24, 48},
+};
+
+static void build_table(CyuvContext *s, int8_t *table, int plane)
+{
+ const uint8_t *pix = s->frame.data[plane];
+ const int stride = s->frame.linesize[plane];
+ const int h = s->height;
+ const int w = s->width >> (plane ? 2 : 0);
+ int i, j;
+ uint64_t ssd, bssd = 1ULL<<63;
+
+ for(i=0; i<N_TABLES; i++){
+ ssd = try_table(some_tables[plane ? N_TABLES-1-i : i], table, &bssd, pix, w, h, stride);
+ printf("%7lx ", ssd);
+ if(ssd == 0){
+ printf("\n");
+ return;
+ }
+ if(ssd > bssd){
+ for(i++ ;i<N_TABLES; i++)
+ printf(" ");
+ break;
+ }
+ }
+ if(table[0] <= -64){
+ for(i=0; i<M_TABLES; i++){
+ ssd = try_table(more_tables[i], table, &bssd, pix, w, h, stride);
+ printf("%7lx ", ssd);
+ }
+ }
+ printf("\n");
+
+ // Iterative refinement:
+ // 11-dimensional diamond search, encoding the whole frame at each step
+ if(s->avctx->context_model >= 2){
+ int8_t tmp_buf[18];
+ int8_t *tmp = tmp_buf+1;
+ tmp[-1] = (int8_t)-129;
+ tmp[16] = (int8_t)128;
+ for(j=0; j<40; j++){
+ int bssd_bakj = bssd;
+ printf("%7lx ", bssd);
+ for(i=0; i<16; i++)
+ printf(" %2x", table[i]&0xff);
+ printf("\n");
+ for(i=0; i<16; i++){
+ int v = table[i];
+ int bssd_baki = bssd;
+ if(v>=-2 && v<=2)
+ continue;
+ memcpy(tmp, table, 16);
+ while((int8_t)(tmp[i]+1) != tmp[i+1] && tmp[i]-v <= j+5){
+ tmp[i]++;
+ ssd = try_table(tmp, table, &bssd, pix, w, h, stride);
+ if(ssd > bssd)
+ break;
+ }
+ if(bssd < bssd_baki)
+ continue;
+ tmp[i] = v;
+ while((int8_t)(tmp[i]-1) != tmp[i-1] && v-tmp[i] <= j+5){
+ tmp[i]--;
+ ssd = try_table(tmp, table, &bssd, pix, w, h, stride);
+ if(ssd > bssd)
+ break;
+ }
+ }
+ if(bssd == bssd_bakj)
+ break;
+ }
+ }
+}
+
+static int cyuv_encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data)
+{
+ CyuvContext *s = avctx->priv_data;
+ AVFrame *pict = data;
+ int8_t *y_table = buf + 0;
+ int8_t *u_table = buf + 16;
+ int8_t *v_table = buf + 32;
+ uint8_t y_itable[256], u_itable[256], v_itable[256];
+ int x, y, p;
+ const int coded_size = 48 + s->height * s->width * 3/4;
+ assert(buf_size >= coded_size);
+ s->frame = *pict;
+ pict->error[0] = pict->error[1] = pict->error[2] = 0;
+
+ if(avctx->context_model > 0){
+ build_table(s, y_table, 0);
+ build_table(s, u_table, 1);
+ build_table(s, v_table, 2);
+ {
+ static int fn = 0;
+ printf("fn %d tables_tried %d \n", fn, tables_tried);
+ fn++;
+ }
+ }else{
+ memcpy(y_table, some_tables[0], 16);
+ memcpy(u_table, some_tables[1], 16);
+ memcpy(v_table, some_tables[1], 16);
+ }
+ build_inverse_table(y_itable, y_table);
+ build_inverse_table(u_itable, u_table);
+ build_inverse_table(v_itable, v_table);
+ buf += 48;
+
+ for(y=0; y<s->height; y++){
+ uint8_t *y_ptr = pict->data[0] + y*pict->linesize[0];
+ uint8_t *u_ptr = pict->data[1] + y*pict->linesize[1];
+ uint8_t *v_ptr = pict->data[2] + y*pict->linesize[2];
+ int y0, y1;
+
+ if(avctx->trellis > 0){
+ uint8_t y_buf[s->width];
+ uint8_t u_buf[s->width>>2];
+ uint8_t v_buf[s->width>>2];
+ pict->error[0] += quantize_row_trellis(avctx, y_ptr, y_buf, y_table, y_itable, s->width);
+ pict->error[1] += quantize_row_trellis(avctx, u_ptr, u_buf, u_table, u_itable, s->width>>2);
+ pict->error[2] += quantize_row_trellis(avctx, v_ptr, v_buf, v_table, v_itable, s->width>>2);
+ for(x=0; x<s->width; x+=4){
+ put_pack4to8(y_buf[x+0], u_buf[x>>2]);
+ put_pack4to8(y_buf[x+1], v_buf[x>>2]);
+ put_pack4to8(y_buf[x+2], y_buf[x+3]);
+ }
+ }else{
+ int y_pred = clip_uint8(y_ptr[0]+8) & -16;
+ int u_pred = clip_uint8(u_ptr[0]+8) & -16;
+ int v_pred = clip_uint8(v_ptr[0]+8) & -16;
+ put_pack4to8(y_pred>>4, u_pred>>4);
+ y0 = quantize_sample(y_ptr[1], &y_pred, y_table, y_itable);
+ put_pack4to8(y0, v_pred>>4);
+ y0 = quantize_sample(y_ptr[2], &y_pred, y_table, y_itable);
+ y1 = quantize_sample(y_ptr[3], &y_pred, y_table, y_itable);
+ put_pack4to8(y0, y1);
+ for(x=4; x<s->width; x+=4){
+ y_ptr+=4;
+ u_ptr++;
+ v_ptr++;
+ y0 = quantize_sample(y_ptr[0], &y_pred, y_table, y_itable);
+ y1 = quantize_sample(u_ptr[0], &u_pred, u_table, u_itable);
+ put_pack4to8(y0, y1);
+ y0 = quantize_sample(y_ptr[1], &y_pred, y_table, y_itable);
+ y1 = quantize_sample(v_ptr[0], &v_pred, v_table, v_itable);
+ put_pack4to8(y0, y1);
+ y0 = quantize_sample(y_ptr[2], &y_pred, y_table, y_itable);
+ y1 = quantize_sample(y_ptr[3], &y_pred, y_table, y_itable);
+ put_pack4to8(y0, y1);
+ }
+ }
+ }
+
+ fflush(stdout);
+
+ s->frame = *pict;
+ avctx->coded_frame = &s->frame;
+ for(p=0; p<3; p++)
+ avctx->error[p] += pict->error[p];
+ return coded_size;
+}
+
+static int cyuv_encode_end(AVCodecContext *avctx)
+{
+ return 0;
+}
+
+
AVCodec cyuv_decoder = {
"cyuv",
CODEC_TYPE_VIDEO,
CODEC_ID_CYUV,
- sizeof(CyuvDecodeContext),
+ sizeof(CyuvContext),
cyuv_decode_init,
NULL,
cyuv_decode_end,
@@ -186,3 +565,12 @@
NULL
};
+AVCodec cyuv_encoder = {
+ "cyuv",
+ CODEC_TYPE_VIDEO,
+ CODEC_ID_CYUV,
+ sizeof(CyuvContext),
+ cyuv_encode_init,
+ cyuv_encode_frame,
+ cyuv_encode_end,
+};
More information about the ffmpeg-devel
mailing list