[FFmpeg-devel] [PATCH] Efficiently support several output pixel formats in Cinepak decoder

u-9iep at aetey.se u-9iep at aetey.se
Wed Feb 1 21:37:02 EET 2017


Hello,

On Sat, Jan 28, 2017 at 07:53:06PM +0100, Michael Niedermayer wrote:
> please provide a git compatible patch
> git format-patch / send-email

The corresponding patches (concerning comments in cinepak-related files)
have been resent in a git-compatible form 2017-01-29.
This patch applies after them.

It represents a large change in the Cinepak decoder for speed/efficiency.
Cinepak was always very good at speed, now it has become even (much) better.

At the cost of a larger code size, it reduces in-loop checking and adds
the ability to produce formats directly suitable for output devices,
without applying pixel format conversion afterwards to the whole frame.

Decoding to certain formats is also remarkably faster than to the default
rgb24. On i686:
 to rgb32 seems to be about 15% faster than to rgb24
 to rgb565 seems to be over 20% faster than to rgb24

Besides this, on a slow device using a 16-bit depth provides a remarkable
speedup per se, compared to larger depths.

Avoiding frame pixel format conversion by generating rgb565 in the decoder
for a corresponding video buffer yields in our tests (on MMX-capable
i*86) more than twice [sic] the playback speed compared to decoding to rgb24.

The default output format remains rgb24, with the full quality offered
by the codec.

Splitting this into multiple patches does not look very useful;
the changes make sense together.

Regards,
Rune
-------------- next part --------------
>From 300b8a4e712d1a404983b245aac501e09326ee72 Mon Sep 17 00:00:00 2001
From: Rl <addr-see-the-website at aetey.se>
Date: Wed, 1 Feb 2017 19:44:40 +0100
Subject: [PATCH] Efficiently support several output pixel formats in Cinepak
 decoder

Optimize decoding in general and largely improve speed,
among others by the ability to produce device-native pixel formats.

The output format can be chosen at runtime by an option.

The format can also be chosen by setting the environment variable
CINEPAK_OUTPUT_FORMAT_OVERRIDE, if this is enabled at build time.
---
 libavcodec/cinepak.c | 957 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 845 insertions(+), 112 deletions(-)

diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c
index d657e9c0c1..76105fcc0c 100644
--- a/libavcodec/cinepak.c
+++ b/libavcodec/cinepak.c
@@ -31,6 +31,8 @@
  *
  * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
  * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
+ * Extra output formats / optimizations (c) 2017 Rl, Aetey Global Technologies AB
+ * @author Extra output formats / optimizations, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -39,23 +41,37 @@
 
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+/* #include "libavutil/avassert.h" */
 #include "avcodec.h"
 #include "internal.h"
 
+/* Pack r,g,b as 5-6-5 bits (r on top): add half a step, clip to 8 bits, shift.
+ * Rounding to nearest is quite cheap in my tests and yields a remarkable
+ * quality improvement compared to simple truncation -- rl */
+#define PACK_RGB_RGB565(r,g,b) (((av_clip_uint8((r)+4)>>3)<<11)|((av_clip_uint8((g)+2)>>2)<<5)|(av_clip_uint8((b)+4)>>3))
 
-typedef uint8_t cvid_codebook[12];
+typedef union cvid_codebook { /* 256 four-pixel vectors, one layout per output format */
+    uint32_t  rgb32[256][ 4]; /* four 32-bit pixels */
+    uint8_t   rgb24[256][12]; /* four pixels, three bytes each */
+    uint16_t rgb565[256][ 4]; /* four 16-bit pixels */
+    uint8_t  yuv420[256][ 6]; /* four luma bytes, then one U and one V byte */
+    uint8_t    pal8[256][ 4]; /* four palette indices */
+} cvid_codebook;
 
-#define MAX_STRIPS      32
+#define MAX_STRIPS      32    /* an arbitrary limit -- rl */
 
 typedef struct cvid_strip {
     uint16_t          id;
     uint16_t          x1, y1;
     uint16_t          x2, y2;
-    cvid_codebook     v4_codebook[256];
-    cvid_codebook     v1_codebook[256];
+    cvid_codebook     v4_codebook;
+    cvid_codebook     v1_codebook;
 } cvid_strip;
 
-typedef struct CinepakContext {
+typedef struct CinepakContext CinepakContext;
+struct CinepakContext {
+    const AVClass *class;
 
     AVCodecContext *avctx;
     AVFrame *frame;
@@ -71,24 +87,111 @@ typedef struct CinepakContext {
     int sega_film_skip_bytes;
 
     uint32_t pal[256];
-} CinepakContext;
 
-static void cinepak_decode_codebook (cvid_codebook *codebook,
-                                     int chunk_id, int size, const uint8_t *data)
+    void (*decode_codebook)(CinepakContext *s,
+                            cvid_codebook *codebook, int chunk_id,
+                            int size, const uint8_t *data);
+    int  (*decode_vectors)(CinepakContext *s, cvid_strip *strip,
+                           int chunk_id, int size, const uint8_t *data);
+/* options */
+    enum AVPixelFormat out_pixfmt;
+};
+
+#define OFFSET(x) offsetof(CinepakContext, x)
+#define VD (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM) /* parenthesized so it expands safely in any expression */
+static const AVOption options[] = { /* private decoder options; the output format defaults to rgb24 */
+{"output_pixel_format", "set output pixel format: rgb24/rgb32/rgb565/yuv420p/pal8; yuv420p is approximate", OFFSET(out_pixfmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_RGB24}, -1, INT_MAX, VD },
+    { NULL },
+};
+
+static const AVClass cinepak_class = { /* ties the option table above to the decoder context */
+    .class_name = "cinepak decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* Decode one codebook chunk into 256 rgb32 vectors of four pixels each,
+ * every pixel packed as opacity 255 in the top byte, then R, G and B; in
+ * palette mode the entries of s->pal are copied unchanged. Truncated chunk
+ * data ends the update early, entries not reached keep their old contents. */
+static void cinepak_decode_codebook_rgb32 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t *eod = (data + size);
+    uint32_t flag = 0, mask = 0;
+    uint32_t *p = codebook->rgb32[0];
+    int palette_video = s->palette_video;
+    int selective_update = (chunk_id & 0x01);
+    /* check if this chunk contains 4- or 6-element vectors */
+    int n = (chunk_id & 0x04) ? 4 : 6;
+    int i;
+
+    for (i=0; i < 256; i++) {
+        if (selective_update && !(mask >>= 1)) {
+            if ((data + 4) > eod)
+                break;
+
+            flag  = AV_RB32 (data); /* next 32 update flags, tested MSB first */
+            data += 4;
+            mask  = 0x80000000;
+        }
+
+        if (!selective_update || (flag & mask)) {
+            int k;
+
+            if ((data + n) > eod)
+                break;
+
+            if (n == 4) {
+                if (palette_video) {
+                    for (k = 0; k < 4; ++k)
+                        *p++ = s->pal[*data++]; /* this is easy */
+                } else {
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++;
+/* in some situations we might not have to set opacity */
+                        *p++ = /**/ (255U<<24)| /**/ (r<<16)|(r<<8)|r; /* 255U: no signed overflow */
+                    }
+                }
+            } else { /* n == 6 */
+                int u = (int8_t)data[4];
+                int v = (int8_t)data[5];
+                for(k=0; k<4; ++k) {
+                    int y = *data++;
+/* in some situations we might not have to set opacity */
+                    *p++ = /**/ (255U<<24)| /**/
+/* here the cinepak color space excels */
+                           (av_clip_uint8(y + v*2)<<16)|
+                           (av_clip_uint8(y - (u/2) - v)<<8)|
+                            av_clip_uint8(y + u*2);
+                }
+                data += 2; /* skip the u and v bytes read above */
+            }
+        } else {
+            p += 4; /* entry not updated, keep its previous contents */
+        }
+    }
+}
+
+static void cinepak_decode_codebook_rgb24 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
 {
     const uint8_t *eod = (data + size);
     uint32_t flag, mask;
     int      i, n;
     uint8_t *p;
+    int palette_video = s->palette_video;
+    int selective_update = (chunk_id & 0x01);
 
     /* check if this chunk contains 4- or 6-element vectors */
     n    = (chunk_id & 0x04) ? 4 : 6;
     flag = 0;
     mask = 0;
 
-    p = codebook[0];
+    p = codebook->rgb24[0];
     for (i=0; i < 256; i++) {
-        if ((chunk_id & 0x01) && !(mask >>= 1)) {
+        if (selective_update && !(mask >>= 1)) {
             if ((data + 4) > eod)
                 break;
 
@@ -97,31 +200,38 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
             mask  = 0x80000000;
         }
 
-        if (!(chunk_id & 0x01) || (flag & mask)) {
+        if (!selective_update || (flag & mask)) {
             int k, kk;
 
             if ((data + n) > eod)
                 break;
 
-            for (k = 0; k < 4; ++k) {
-                int r = *data++;
-                for (kk = 0; kk < 3; ++kk)
-                    *p++ = r;
-            }
-            if (n == 6) {
-                int r, g, b, u, v;
-                u = *(int8_t *)data++;
-                v = *(int8_t *)data++;
-                p -= 12;
+            if (n == 4)
+                if (palette_video)
+                    for (k = 0; k < 4; ++k) {
+                        uint32_t r = s->pal[*data++];
+                        *p++ = (r>>16)&0xff;
+                        *p++ = (r>>8) &0xff;
+                        *p++ =  r     &0xff;
+                    }
+                else
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++;
+                        for (kk = 0; kk < 3; ++kk)
+                            *p++ = r;
+                    }
+            else { /* n == 6 */
+                int y, u, v;
+                u = (int8_t)data[4];
+                v = (int8_t)data[5];
                 for(k=0; k<4; ++k) {
-                    r = *p++ + v*2;
-                    g = *p++ - (u/2) - v;
-                    b = *p   + u*2;
-                    p -= 2;
-                    *p++ = av_clip_uint8(r);
-                    *p++ = av_clip_uint8(g);
-                    *p++ = av_clip_uint8(b);
+                    y = *data++;
+/* here the cinepak color space excels */
+                    *p++ = av_clip_uint8(y + v*2);
+                    *p++ = av_clip_uint8(y - (u/2) - v);
+                    *p++ = av_clip_uint8(y + u*2);
                 }
+                data += 2;
             }
         } else {
             p += 12;
@@ -129,14 +239,296 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
     }
 }
 
-static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
+/* Decode one codebook chunk into 256 rgb565 vectors of four pixels each.
+ * Components are rounded and clipped by PACK_RGB_RGB565; in palette mode
+ * each s->pal entry is unpacked as (R<<16)|(G<<8)|B first. Truncated chunk
+ * data ends the update early, entries not reached keep their old contents. */
+static void cinepak_decode_codebook_rgb565 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t *eod = (data + size);
+    uint32_t flag = 0, mask = 0;
+    uint16_t *p = codebook->rgb565[0];
+    int palette_video = s->palette_video;
+    int selective_update = (chunk_id & 0x01);
+    /* check if this chunk contains 4- or 6-element vectors */
+    int n = (chunk_id & 0x04) ? 4 : 6;
+    int i;
+
+    for (i=0; i < 256; i++) {
+        if (selective_update && !(mask >>= 1)) {
+            if ((data + 4) > eod)
+                break;
+
+            flag  = AV_RB32 (data); /* next 32 update flags, tested MSB first */
+            data += 4;
+            mask  = 0x80000000;
+        }
+
+        if (!selective_update || (flag & mask)) {
+            int k;
+
+            if ((data + n) > eod)
+                break;
+
+            if (n == 4) {
+                if (palette_video) {
+                    for (k = 0; k < 4; ++k) {
+                        uint32_t r = s->pal[*data++];
+                        *p++ = PACK_RGB_RGB565((r>>16)&0xff,
+                                               (r>>8)&0xff,
+                                                r&0xff);
+                    }
+                } else {
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++; /* gray level used for all three components */
+                        *p++ = PACK_RGB_RGB565(r,r,r);
+                    }
+                }
+            } else { /* n == 6 */
+                int u = (int8_t)data[4];
+                int v = (int8_t)data[5];
+                for(k=0; k<4; ++k) {
+                    int y = *data++;
+/* here the cinepak color space excels */
+                    *p++ = PACK_RGB_RGB565(y + v*2,
+                                           y - (u/2) - v,
+                                           y + u*2);
+                }
+                data += 2; /* skip the u and v bytes read above */
+            }
+        } else {
+            p += 4; /* entry not updated, keep its previous contents */
+        }
+    }
+}
+
+/* a simplistic version to begin with, it is also fast -- rl
+ * Decode one codebook chunk into 256 yuv420 vectors laid out as four luma
+ * bytes followed by one U and one V byte. Palette colours are converted
+ * with the integer weights below and their chroma is averaged over the
+ * four pixels; truncated chunk data ends the update early. */
+static void cinepak_decode_codebook_yuv420 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t *eod = (data + size);
+    uint32_t flag = 0, mask = 0;
+    uint8_t *p = codebook->yuv420[0];
+    int palette_video = s->palette_video;
+    int selective_update = (chunk_id & 0x01);
+    /* check if this chunk contains 4- or 6-element vectors */
+    int n = (chunk_id & 0x04) ? 4 : 6;
+    int i;
+
+    for (i=0; i < 256; i++) {
+        if (selective_update && !(mask >>= 1)) {
+            if ((data + 4) > eod)
+                break;
+
+            flag  = AV_RB32 (data); /* next 32 update flags, tested MSB first */
+            data += 4;
+            mask  = 0x80000000;
+        }
+
+        if (!selective_update || (flag & mask)) {
+            int k;
+
+            if ((data + n) > eod)
+                break;
+
+            if (n == 4) {
+                if (palette_video) {
+/* here we have kind of "more" data than the output format can express */
+                    int r, g, b, u = 0, v = 0;
+                    for (k = 0; k < 4; ++k) {
+                        uint32_t rr = s->pal[*data++];
+                        r = (rr>>16)&0xff;
+                        g = (rr>>8) &0xff;
+                        b =  rr     &0xff;
+/* calculate the components (https://en.wikipedia.org/wiki/YUV) */
+                        *p++ = ((r*66+g*129+b*25+128)>>8)+16;
+                        u += (-r*38-g*74+b*112+128)>>8;
+                        v += (r*112-g*94-b*18+128)>>8;
+                    }
+                    *p++ = (u+2)/4+128; /* mean chroma of the four pixels, biased to unsigned */
+                    *p++ = (v+2)/4+128;
+                } else { /* grayscale, easy */
+                    for (k = 0; k < 4; ++k) {
+                        *p++ = *data++;
+                    }
+                    *p++ = 128; /* neutral chroma */
+                    *p++ = 128;
+                }
+            } else { /* n == 6 */
+/* here we'd have to handle double format conversion
+ * Cinepak=>rgb24 and then rgb24=>yuv420p, which can not be shortcut;
+ * for the moment just copying as-is, for simplicity and speed,
+ * color will be slightly off but not much */
+                *p++ = *data++;
+                *p++ = *data++;
+                *p++ = *data++;
+                *p++ = *data++;
+                *p++ = *data++ + 128; /* chroma byte plus 128, truncated to 8 bits on store */
+                *p++ = *data++ + 128;
+            }
+        } else {
+            p += 6; /* entry not updated, keep its previous contents */
+        }
+    }
+}
+
+/* Decode one codebook chunk for pal8 output: each of the 256 vectors is
+ * four palette indices copied verbatim from the stream; truncated chunk
+ * data ends the update early.
+ * here we do not expect anything besides palettized video,
+ * nor check the data for validity, which should be ok
+ * as long as we do not write beyond the bounds */
+static void cinepak_decode_codebook_pal8 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t *eod = (data + size);
+    uint32_t flag = 0, mask = 0;
+    uint8_t *p = codebook->pal8[0];
+    int selective_update = (chunk_id & 0x01);
+    int i;
+
+/* bytes per vector; has to match the pal8[256][4] codebook layout */
+#define PAL8_VECTOR_LENGTH 4
+/*    av_assert0(chunk_id & 0x04); */
+
+    for (i=0; i < 256; i++) {
+        if (selective_update && !(mask >>= 1)) {
+            if ((data + 4) > eod)
+                break;
+
+            flag  = AV_RB32 (data); /* next 32 update flags, tested MSB first */
+            data += 4;
+            mask  = 0x80000000;
+        }
+
+        if (!selective_update || (flag & mask)) {
+            int k;
+
+            if ((data + PAL8_VECTOR_LENGTH) > eod)
+                break;
+
+            for (k = 0; k < PAL8_VECTOR_LENGTH; ++k)
+                *p++ = *data++;
+        } else {
+            p += PAL8_VECTOR_LENGTH; /* entry not updated, keep its previous contents */
+        }
+    }
+}
+
+static int cinepak_decode_vectors_rgb32 (CinepakContext *s, cvid_strip *strip,
+                                   int chunk_id, int size, const uint8_t *data)
+{ /* paint the strip's 4x4 blocks into frame->data[0] from the rgb32 codebooks; returns 0, or AVERROR_INVALIDDATA when the chunk data runs out */
+    const uint8_t   *eod = (data + size);
+    uint32_t         flag, mask;
+    uint32_t         *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3; /* the four output rows of the current block, top to bottom */
+    int selective_update = (chunk_id & 0x01);
+    int v1_only          = (chunk_id & 0x02);
+
+    flag = 0;
+    mask = 0;
+
+    for (y=strip->y1; y < strip->y2; y+=4) {
+
+/* take care of y dimension not being multiple of 4, such streams exist */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1*4 + y*s->frame->linesize[0]; /* 4 bytes per pixel */
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+/* to get the correct picture for not-multiple-of-4 cases let us fill each
+ * block from the bottom up, thus possibly overwriting the bottommost line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
+
+        for (x=strip->x1; x < strip->x2; x+=4) {
+            if (selective_update && !(mask >>= 1)) {
+                if ((data + 4) > eod)
+                    return AVERROR_INVALIDDATA;
+
+                flag  = AV_RB32 (data); /* reload 32 flag bits, consumed MSB first */
+                data += 4;
+                mask  = 0x80000000;
+            }
+
+            if (!selective_update || (flag & mask)) { /* block is coded */
+                if (!v1_only && !(mask >>= 1)) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    flag  = AV_RB32 (data);
+                    data += 4;
+                    mask  = 0x80000000;
+                }
+
+                if (v1_only || (~flag & mask)) { /* V1: one codebook vector, each pixel doubled to 2x2 */
+                    uint32_t *p;
+                    if (data >= eod)
+                        return AVERROR_INVALIDDATA;
+
+                    p = strip->v1_codebook.rgb32[*data++] + 2; /* ... + 8 */
+                    memcpy(ip3 + 0, p, 4); memcpy(ip3 + 4, p, 4);
+                    memcpy(ip2 + 0, p, 4); memcpy(ip2 + 4, p, 4);
+                    p += 1; /* ... + 12 */
+                    memcpy(ip3 + 8, p, 4); memcpy(ip3 + 12, p, 4);
+                    memcpy(ip2 + 8, p, 4); memcpy(ip2 + 12, p, 4);
+                    p -= 3; /* ... + 0 */
+                    memcpy(ip1 + 0, p, 4); memcpy(ip1 + 4, p, 4);
+                    memcpy(ip0 + 0, p, 4); memcpy(ip0 + 4, p, 4);
+                    p += 1; /* ... + 4 */
+                    memcpy(ip1 + 8, p, 4); memcpy(ip1 + 12, p, 4);
+                    memcpy(ip0 + 8, p, 4); memcpy(ip0 + 12, p, 4);
+
+                } else if (flag & mask) { /* V4: four codebook vectors, one per 2x2 quadrant */
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    cb0 = strip->v4_codebook.rgb32[*data++];
+                    cb1 = strip->v4_codebook.rgb32[*data++];
+                    cb2 = strip->v4_codebook.rgb32[*data++];
+                    cb3 = strip->v4_codebook.rgb32[*data++];
+                    memcpy(ip3 + 0, cb2 + 2, 8); /* 8 bytes = two pixels */
+                    memcpy(ip3 + 8, cb3 + 2, 8);
+                    memcpy(ip2 + 0, cb2 + 0, 8);
+                    memcpy(ip2 + 8, cb3 + 0, 8);
+                    memcpy(ip1 + 0, cb0 + 2, 8);
+                    memcpy(ip1 + 8, cb1 + 2, 8);
+                    memcpy(ip0 + 0, cb0 + 0, 8);
+                    memcpy(ip0 + 8, cb1 + 0, 8);
+
+                }
+            }
+
+            ip0 += 16;  ip1 += 16; /* advance one block: 4 pixels of 4 bytes */
+            ip2 += 16;  ip3 += 16;
+        }
+    }
+
+    return 0;
+}
+
+static int cinepak_decode_vectors_rgb24 (CinepakContext *s, cvid_strip *strip,
                                    int chunk_id, int size, const uint8_t *data)
 {
     const uint8_t   *eod = (data + size);
     uint32_t         flag, mask;
     uint8_t         *cb0, *cb1, *cb2, *cb3;
-    int             x, y;
+    int              x, y;
     char            *ip0, *ip1, *ip2, *ip3;
+    int selective_update = (chunk_id & 0x01);
+    int v1_only          = (chunk_id & 0x02);
 
     flag = 0;
     mask = 0;
@@ -145,7 +537,7 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
 
 /* take care of y dimension not being multiple of 4, such streams exist */
         ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
-          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+                                strip->x1*3 + y*s->frame->linesize[0];
         if(s->avctx->height - y > 1) {
             ip1 = ip0 + s->frame->linesize[0];
             if(s->avctx->height - y > 2) {
@@ -161,7 +553,7 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
  * (instead of in-loop checking) */
 
         for (x=strip->x1; x < strip->x2; x+=4) {
-            if ((chunk_id & 0x01) && !(mask >>= 1)) {
+            if (selective_update && !(mask >>= 1)) {
                 if ((data + 4) > eod)
                     return AVERROR_INVALIDDATA;
 
@@ -170,8 +562,8 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
                 mask  = 0x80000000;
             }
 
-            if (!(chunk_id & 0x01) || (flag & mask)) {
-                if (!(chunk_id & 0x02) && !(mask >>= 1)) {
+            if (!selective_update || (flag & mask)) {
+                if (!v1_only && !(mask >>= 1)) {
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
@@ -180,84 +572,357 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
                     mask  = 0x80000000;
                 }
 
-                if ((chunk_id & 0x02) || (~flag & mask)) {
+                if (v1_only || (~flag & mask)) {
                     uint8_t *p;
                     if (data >= eod)
                         return AVERROR_INVALIDDATA;
 
-                    p = strip->v1_codebook[*data++];
-                    if (s->palette_video) {
-                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
-                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
-                        ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
-                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
-                    } else {
-                        p += 6;
-                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
-                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
-                        p += 3; /* ... + 9 */
-                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
-                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
-                        p -= 9; /* ... + 0 */
-                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
-                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
-                        p += 3; /* ... + 3 */
-                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
-                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
-                    }
+                    p = strip->v1_codebook.rgb24[*data++] + 6;
+                    memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
+                    memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
+                    p += 3; /* ... + 9 */
+                    memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
+                    memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
+                    p -= 9; /* ... + 0 */
+                    memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
+                    memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
+                    p += 3; /* ... + 3 */
+                    memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
+                    memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
 
                 } else if (flag & mask) {
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
-                    cb0 = strip->v4_codebook[*data++];
-                    cb1 = strip->v4_codebook[*data++];
-                    cb2 = strip->v4_codebook[*data++];
-                    cb3 = strip->v4_codebook[*data++];
-                    if (s->palette_video) {
-                        uint8_t *p;
-                        p = ip3;
-                        *p++ = cb2[6];
-                        *p++ = cb2[9];
-                        *p++ = cb3[6];
-                        *p   = cb3[9];
-                        p = ip2;
-                        *p++ = cb2[0];
-                        *p++ = cb2[3];
-                        *p++ = cb3[0];
-                        *p   = cb3[3];
-                        p = ip1;
-                        *p++ = cb0[6];
-                        *p++ = cb0[9];
-                        *p++ = cb1[6];
-                        *p   = cb1[9];
-                        p = ip0;
-                        *p++ = cb0[0];
-                        *p++ = cb0[3];
-                        *p++ = cb1[0];
-                        *p   = cb1[3];
-                    } else {
-                        memcpy(ip3 + 0, cb2 + 6, 6);
-                        memcpy(ip3 + 6, cb3 + 6, 6);
-                        memcpy(ip2 + 0, cb2 + 0, 6);
-                        memcpy(ip2 + 6, cb3 + 0, 6);
-                        memcpy(ip1 + 0, cb0 + 6, 6);
-                        memcpy(ip1 + 6, cb1 + 6, 6);
-                        memcpy(ip0 + 0, cb0 + 0, 6);
-                        memcpy(ip0 + 6, cb1 + 0, 6);
-                    }
+                    cb0 = strip->v4_codebook.rgb24[*data++];
+                    cb1 = strip->v4_codebook.rgb24[*data++];
+                    cb2 = strip->v4_codebook.rgb24[*data++];
+                    cb3 = strip->v4_codebook.rgb24[*data++];
+                    memcpy(ip3 + 0, cb2 + 6, 6);
+                    memcpy(ip3 + 6, cb3 + 6, 6);
+                    memcpy(ip2 + 0, cb2 + 0, 6);
+                    memcpy(ip2 + 6, cb3 + 0, 6);
+                    memcpy(ip1 + 0, cb0 + 6, 6);
+                    memcpy(ip1 + 6, cb1 + 6, 6);
+                    memcpy(ip0 + 0, cb0 + 0, 6);
+                    memcpy(ip0 + 6, cb1 + 0, 6);
 
                 }
             }
 
-            if (s->palette_video) {
-                ip0 += 4;  ip1 += 4;
-                ip2 += 4;  ip3 += 4;
-            } else {
-                ip0 += 12;  ip1 += 12;
-                ip2 += 12;  ip3 += 12;
+            ip0 += 12;  ip1 += 12;
+            ip2 += 12;  ip3 += 12;
+        }
+    }
+
+    return 0;
+}
+
+static int cinepak_decode_vectors_rgb565 (CinepakContext *s, cvid_strip *strip,
+                                   int chunk_id, int size, const uint8_t *data)
+{ /* paint the strip's 4x4 blocks into frame->data[0] from the rgb565 codebooks; returns 0, or AVERROR_INVALIDDATA when the chunk data runs out */
+    const uint8_t   *eod = (data + size);
+    uint32_t         flag, mask;
+    uint16_t        *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3; /* the four output rows of the current block, top to bottom */
+    int selective_update = (chunk_id & 0x01);
+    int v1_only          = (chunk_id & 0x02);
+
+    flag = 0;
+    mask = 0;
+
+    for (y=strip->y1; y < strip->y2; y+=4) {
+
+/* take care of y dimension not being multiple of 4, such streams exist */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1*2 + y*s->frame->linesize[0]; /* 2 bytes per pixel */
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+/* to get the correct picture for not-multiple-of-4 cases let us fill each
+ * block from the bottom up, thus possibly overwriting the bottommost line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
+
+        for (x=strip->x1; x < strip->x2; x+=4) {
+            if (selective_update && !(mask >>= 1)) {
+                if ((data + 4) > eod)
+                    return AVERROR_INVALIDDATA;
+
+                flag  = AV_RB32 (data); /* reload 32 flag bits, consumed MSB first */
+                data += 4;
+                mask  = 0x80000000;
+            }
+
+            if (!selective_update || (flag & mask)) { /* block is coded */
+                if (!v1_only && !(mask >>= 1)) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    flag  = AV_RB32 (data);
+                    data += 4;
+                    mask  = 0x80000000;
+                }
+
+                if (v1_only || (~flag & mask)) { /* V1: one codebook vector, each pixel doubled to 2x2 */
+                    uint16_t *p;
+                    if (data >= eod)
+                        return AVERROR_INVALIDDATA;
+
+                    p = strip->v1_codebook.rgb565[*data++];
+                    * (uint16_t *)ip3    = *((uint16_t *)ip3+1) = /* NOTE(review): 16-bit stores assume 2-byte aligned rows -- confirm */
+                    * (uint16_t *)ip2    = *((uint16_t *)ip2+1) = p[2];
+                    *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) =
+                    *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3];
+                    * (uint16_t *)ip1    = *((uint16_t *)ip1+1) =
+                    * (uint16_t *)ip0    = *((uint16_t *)ip0+1) = p[0];
+                    *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) =
+                    *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1];
+
+                } else if (flag & mask) { /* V4: four codebook vectors, one per 2x2 quadrant */
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    cb0 = strip->v4_codebook.rgb565[*data++];
+                    cb1 = strip->v4_codebook.rgb565[*data++];
+                    cb2 = strip->v4_codebook.rgb565[*data++];
+                    cb3 = strip->v4_codebook.rgb565[*data++];
+                    memcpy(ip3 + 0, cb2 + 2, 4); /* 4 bytes = two pixels */
+                    memcpy(ip3 + 4, cb3 + 2, 4);
+                    memcpy(ip2 + 0, cb2 + 0, 4);
+                    memcpy(ip2 + 4, cb3 + 0, 4);
+                    memcpy(ip1 + 0, cb0 + 2, 4);
+                    memcpy(ip1 + 4, cb1 + 2, 4);
+                    memcpy(ip0 + 0, cb0 + 0, 4);
+                    memcpy(ip0 + 4, cb1 + 0, 4);
+
+                }
+            }
+
+            ip0 += 8;  ip1 += 8; /* advance one block: 4 pixels of 2 bytes */
+            ip2 += 8;  ip3 += 8;
+        }
+    }
+
+    return 0;
+}
+
+/* Decode one vector chunk of a strip into a planar YUV 4:2:0 frame; codebook
+ * entries hold four luma bytes followed by one U and one V byte.  chunk_id
+ * bit 0 selects flag-driven selective update, bit 1 V1-only coding.  Returns 0
+ * on success or AVERROR_INVALIDDATA when data (size bytes) ends prematurely. */
+static int cinepak_decode_vectors_yuv420 (CinepakContext *s, cvid_strip *strip,
+                                   int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t   *eod = (data + size);
+    uint32_t         flag = 0, mask = 0;
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    uint8_t         *ip0, *ip1, *ip2, *ip3,      /* luma rows of a block */
+                    *up01, *up23, *vp01, *vp23;  /* chroma rows of a block */
+    int selective_update = (chunk_id & 0x01);
+    int v1_only          = (chunk_id & 0x02);
+
+    for (y=strip->y1; y < strip->y2; y+=4) {
+/* take care of y dimension not being multiple of 4, such streams exist;
+ * luma has one byte per pixel and chroma one byte per two pixels, hence the
+ * horizontal offsets x1 and x1/2 */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1 + y*s->frame->linesize[0];
+        up01 = up23 = s->frame->data[1] + strip->x1/2 + y/2*s->frame->linesize[1];
+        vp01 = vp23 = s->frame->data[2] + strip->x1/2 + y/2*s->frame->linesize[2];
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                up23 = up01 + s->frame->linesize[1];
+                vp23 = vp01 + s->frame->linesize[2];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+
+/* to get the correct picture for not-multiple-of-4 cases let us fill each
+ * block from the bottom up, thus possibly overwriting the bottommost line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
+
+        for (x=strip->x1; x < strip->x2; x+=4) {
+            if (selective_update && !(mask >>= 1)) {
+                if ((data + 4) > eod)
+                    return AVERROR_INVALIDDATA;
+                /* flag word exhausted: load the next 32 flags, used MSB first */
+                flag  = AV_RB32 (data);
+                data += 4;
+                mask  = 0x80000000;
+            }
+
+            if (!selective_update || (flag & mask)) {
+                if (!v1_only && !(mask >>= 1)) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+                    /* the V1/V4 selector bit lives in the next flag word */
+                    flag  = AV_RB32 (data);
+                    data += 4;
+                    mask  = 0x80000000;
+                }
+
+                if (v1_only || (~flag & mask)) {
+                    uint8_t *p;
+                    if (data >= eod)
+                        return AVERROR_INVALIDDATA;
+                    /* V1: one entry for the block, each luma byte covers 2x2 pixels */
+                    p = strip->v1_codebook.yuv420[*data++];
+                    ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2];
+                    ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3];
+                    ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                    ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1];
+                    p += 4;
+                    up01[0] = up01[1] = up23[0] = up23[1] = *p++;
+                    vp01[0] = vp01[1] = vp23[0] = vp23[1] = *p++;
+                } else if (flag & mask) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+                    /* V4: one entry per 2x2 quadrant, each with its own U and V */
+                    cb0 = strip->v4_codebook.yuv420[*data++];
+                    cb1 = strip->v4_codebook.yuv420[*data++];
+                    cb2 = strip->v4_codebook.yuv420[*data++];
+                    cb3 = strip->v4_codebook.yuv420[*data++];
+                    memcpy(ip3 + 0, cb2 + 2, 2);
+                    memcpy(ip3 + 2, cb3 + 2, 2);
+                    memcpy(ip2 + 0, cb2 + 0, 2);
+                    memcpy(ip2 + 2, cb3 + 0, 2);
+                    memcpy(ip1 + 0, cb0 + 2, 2);
+                    memcpy(ip1 + 2, cb1 + 2, 2);
+                    memcpy(ip0 + 0, cb0 + 0, 2);
+                    memcpy(ip0 + 2, cb1 + 0, 2);
+                    cb0 += 4; cb1 += 4; cb2 += 4; cb3 += 4;
+                    up23[0] = *cb2++; vp23[0] = *cb2;
+                    up23[1] = *cb3++; vp23[1] = *cb3;
+                    up01[0] = *cb0++; vp01[0] = *cb0;
+                    up01[1] = *cb1++; vp01[1] = *cb1;
+                }
+            }
+
+            ip0 += 4;  ip1 += 4;
+            ip2 += 4;  ip3 += 4;
+            up01 += 2; up23 += 2;
+            vp01 += 2; vp23 += 2;
+        }
+    }
+
+    return 0;
+}
+/* Decode one vector chunk of a strip into a PAL8 frame (one byte per pixel); returns 0 or AVERROR_INVALIDDATA on truncated data. */
+static int cinepak_decode_vectors_pal8 (CinepakContext *s, cvid_strip *strip,
+                                 int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t   *eod = (data + size); /* first byte past the chunk payload */
+    uint32_t         flag, mask;
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3; /* the four output rows of the current block */
+    int selective_update = (chunk_id & 0x01); /* bit 0: blocks are updated only where flagged */
+    int v1_only          = (chunk_id & 0x02); /* bit 1: no V1/V4 selector bits are read */
+
+    flag = 0;
+    mask = 0;
+
+    for (y=strip->y1; y < strip->y2; y+=4) {
+
+/* take care of y dimension not being multiple of 4, such streams exist */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1 + y*s->frame->linesize[0];
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+
+/* to get the correct picture for not-multiple-of-4 cases let us fill each
+ * block from the bottom up, thus possibly overwriting the bottommost line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
+
+        for (x=strip->x1; x < strip->x2; x+=4) {
+            if (selective_update && !(mask >>= 1)) {
+                if ((data + 4) > eod)
+                    return AVERROR_INVALIDDATA;
+                /* flag word exhausted: load the next 32 flags, used MSB first */
+                flag  = AV_RB32 (data);
+                data += 4;
+                mask  = 0x80000000;
+            }
+
+            if (!selective_update || (flag & mask)) {
+                if (!v1_only && !(mask >>= 1)) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+                    /* the V1/V4 selector bit lives in the next flag word */
+                    flag  = AV_RB32 (data);
+                    data += 4;
+                    mask  = 0x80000000;
+                }
+
+                if (v1_only || (~flag & mask)) {
+                    uint8_t *p;
+                    if (data >= eod)
+                        return AVERROR_INVALIDDATA;
+                    /* V1: one entry for the whole block, each of its 4 bytes covers 2x2 pixels */
+                    p = strip->v1_codebook.pal8[*data++];
+                    ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2];
+                    ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3];
+                    ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                    ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1];
+
+                } else if (flag & mask) {
+                    uint8_t *p;
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+                    /* V4: one entry per 2x2 quadrant; rows are written bottom-up */
+                    cb0 = strip->v4_codebook.pal8[*data++];
+                    cb1 = strip->v4_codebook.pal8[*data++];
+                    cb2 = strip->v4_codebook.pal8[*data++];
+                    cb3 = strip->v4_codebook.pal8[*data++];
+                    p = ip3;
+                    *p++ = cb2[2];
+                    *p++ = cb2[3];
+                    *p++ = cb3[2];
+                    *p   = cb3[3];
+                    p = ip2;
+                    *p++ = cb2[0];
+                    *p++ = cb2[1];
+                    *p++ = cb3[0];
+                    *p   = cb3[1];
+                    p = ip1;
+                    *p++ = cb0[2];
+                    *p++ = cb0[3];
+                    *p++ = cb1[2];
+                    *p   = cb1[3];
+                    p = ip0;
+                    *p++ = cb0[0];
+                    *p++ = cb0[1];
+                    *p++ = cb1[0];
+                    *p   = cb1[1];
+
+                }
+            }
+
+            ip0 += 4;  ip1 += 4;
+            ip2 += 4;  ip3 += 4;
+        }
     }
 
     return 0;
@@ -290,22 +955,22 @@ static int cinepak_decode_strip (CinepakContext *s,
         case 0x21:
         case 0x24:
         case 0x25:
-            cinepak_decode_codebook (strip->v4_codebook, chunk_id,
-                chunk_size, data);
+            s->decode_codebook(s, &strip->v4_codebook,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x22:
         case 0x23:
         case 0x26:
         case 0x27:
-            cinepak_decode_codebook (strip->v1_codebook, chunk_id,
-                chunk_size, data);
+            s->decode_codebook (s, &strip->v1_codebook,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x30:
         case 0x31:
         case 0x32:
-            return cinepak_decode_vectors (s, strip, chunk_id,
+            return s->decode_vectors (s, strip, chunk_id,
                 chunk_size, data);
         }
 
@@ -385,9 +1050,9 @@ static int cinepak_decode (CinepakContext *s)
         strip_size = ((s->data + strip_size) > eod) ? (eod - s->data) : strip_size;
 
         if ((i > 0) && !(frame_flags & 0x01)) {
-            memcpy (s->strips[i].v4_codebook, s->strips[i-1].v4_codebook,
+            memcpy (&s->strips[i].v4_codebook, &s->strips[i-1].v4_codebook,
                 sizeof(s->strips[i].v4_codebook));
-            memcpy (s->strips[i].v1_codebook, s->strips[i-1].v1_codebook,
+            memcpy (&s->strips[i].v1_codebook, &s->strips[i-1].v1_codebook,
                 sizeof(s->strips[i].v1_codebook));
         }
 
@@ -402,9 +1067,24 @@ static int cinepak_decode (CinepakContext *s)
     return 0;
 }
 
+static const enum AVPixelFormat ff_cinepak_pixfmt_list[] = { /* the decoder's .pix_fmts table */
+    AV_PIX_FMT_RGB24,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_RGB565,
+    AV_PIX_FMT_YUV420P, /* logged as "approximate yuv420p" by cinepak_decode_init() */
+    AV_PIX_FMT_PAL8, /* only when input is palettized */
+    AV_PIX_FMT_NONE /* last entry; presumably the list terminator expected by .pix_fmts */
+};
+
 static av_cold int cinepak_decode_init(AVCodecContext *avctx)
 {
     CinepakContext *s = avctx->priv_data;
+#ifdef CINEPAK_OUTPUT_FORMAT_OVERRIDE
+    char *out_fmt_override = getenv("CINEPAK_OUTPUT_FORMAT_OVERRIDE");
+#endif
+
+/* we take advantage of VQ (vector quantization): pixels are copied straight
+ * from per-format codebooks, which efficiently supports multiple output formats */
 
     s->avctx = avctx;
     s->width = (avctx->width + 3) & ~3;
@@ -412,13 +1092,64 @@ static av_cold int cinepak_decode_init(AVCodecContext *avctx)
 
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
-    // check for paletted data
-    if (avctx->bits_per_coded_sample != 8) {
-        s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_RGB24;
-    } else {
-        s->palette_video = 1;
-        avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    /* check for paletted data */
+    s->palette_video = (avctx->bits_per_coded_sample == 8);
+    av_log(avctx, AV_LOG_INFO, "this is %sa palette video\n", s->palette_video?"":"not ");
+
+#ifdef CINEPAK_OUTPUT_FORMAT_OVERRIDE
+    if (out_fmt_override && *out_fmt_override) {
+        if (       !strcmp(out_fmt_override, "rgb32")) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        } else if (!strcmp(out_fmt_override, "rgb24")) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if (!strcmp(out_fmt_override, "rgb565")) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB565;
+        } else if (!strcmp(out_fmt_override, "yuv420p")) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        } else if (!strcmp(out_fmt_override, "pal8")) {
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format override '%s'\n",
+                                        out_fmt_override);
+            return AVERROR(EINVAL);
+        }
+    } else
+#endif
+    avctx->pix_fmt = s->out_pixfmt;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGB32:
+        av_log(avctx, AV_LOG_INFO, "Codec output pixel format is rgb32\n");
+        s->decode_codebook = cinepak_decode_codebook_rgb32;
+        s->decode_vectors  = cinepak_decode_vectors_rgb32;
+        break;
+    case AV_PIX_FMT_RGB24:
+        av_log(avctx, AV_LOG_INFO, "Codec output pixel format is rgb24\n");
+        s->decode_codebook = cinepak_decode_codebook_rgb24;
+        s->decode_vectors  = cinepak_decode_vectors_rgb24;
+        break;
+    case AV_PIX_FMT_RGB565:
+        av_log(avctx, AV_LOG_INFO, "Codec output pixel format is rgb565\n");
+        s->decode_codebook = cinepak_decode_codebook_rgb565;
+        s->decode_vectors  = cinepak_decode_vectors_rgb565;
+        break;
+    case AV_PIX_FMT_YUV420P:
+        av_log(avctx, AV_LOG_INFO, "Codec output pixel format is approximate yuv420p\n");
+        s->decode_codebook = cinepak_decode_codebook_yuv420;
+        s->decode_vectors  = cinepak_decode_vectors_yuv420;
+        break;
+    case AV_PIX_FMT_PAL8:
+        if (!s->palette_video) {
+            av_log(avctx, AV_LOG_ERROR, "Palettized output not supported without palettized input\n");
+            return AVERROR(EINVAL);
+        }
+        av_log(avctx, AV_LOG_INFO, "Codec output pixel format is pal8\n");
+        s->decode_codebook = cinepak_decode_codebook_pal8;
+        s->decode_vectors  = cinepak_decode_vectors_pal8;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format %d\n", avctx->pix_fmt);
+        return AVERROR(EINVAL);
     }
 
     s->frame = av_frame_alloc();
@@ -457,7 +1188,7 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n");
     }
 
-    if (s->palette_video)
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
         memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE);
 
     if ((ret = av_frame_ref(data, s->frame)) < 0)
@@ -488,4 +1219,6 @@ AVCodec ff_cinepak_decoder = {
     .close          = cinepak_decode_end,
     .decode         = cinepak_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .pix_fmts       = ff_cinepak_pixfmt_list,
+    .priv_class     = &cinepak_class,
 };
-- 
2.11.0



More information about the ffmpeg-devel mailing list