[FFmpeg-devel] [PATCH 1/2] lavu: add text_file API.

wm4 nfxjfg at googlemail.com
Thu Aug 8 14:26:15 CEST 2013


On Thu,  8 Aug 2013 14:08:43 +0200
Nicolas George <nicolas.george at normalesup.org> wrote:

> TODO: version bump, APIChanges entry, !HAVE_ICONV path.
> 
> Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
> ---

> +/**
> + * NULL-terminated list of source encodings that guess_encoding() tries, in
> + * order, when the AVTextFile provides no encoding list of its own and no
> + * byte order mark was recognized at the start of the data.
> + */
> +static const char *const default_encodings[] = {
> +    "UTF-8",
> +    "US-ASCII",
> +    "WINDOWS-1252",
> +    "ISO-8859-1",
> +    NULL
> +};
> +
> +/**
> + * Try to convert the whole of tf->full_data from the given encoding to
> + * UTF-8 using iconv.
> + *
> + * On success the old tf->full_data is freed and replaced by the converted,
> + * NUL-terminated buffer; tf->full_data_size and tf->encoding are updated.
> + * On failure none of the tf fields are modified.
> + *
> + * @param tf       text file whose full_data / full_data_size are converted
> + * @param encoding name of the source encoding, passed to iconv_open(); the
> + *                 pointer itself (not a copy) is stored in tf->encoding
> + * @return 0 on success; AVERROR(errno) if iconv_open() fails;
> + *         AVERROR(ENOMEM) if the output buffer cannot provide more than one
> + *         byte; AVERROR_INVALIDDATA if an iconv() call consumes no input;
> + *         or the negative value returned by av_bprint_finalize()
> + */
> +static int try_encoding(AVTextFile *tf, const char *encoding)
> +{
> +    iconv_t cd;
> +    AVBPrint bp;
> +    char *inbuf, *outbuf, *recoded;
> +    size_t insize, outsize, insize_orig;
> +    unsigned outsize_int;
> +    int ret = 0;
> +
> +    if ((cd = iconv_open("UTF-8", encoding)) == (iconv_t)-1)
> +        return AVERROR(errno);
> +    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> +    inbuf  = tf->full_data;
> +    insize = tf->full_data_size;
> +    /* Convert chunk by chunk until all input bytes have been consumed. */
> +    while (insize) {
> +        /* Get writable space in the bprint buffer; outsize_int receives
> +         * the amount of space that is available at outbuf. */
> +        av_bprint_get_buffer(&bp, 512, (unsigned char **)&outbuf, &outsize_int);
> +        if (outsize_int <= 1) {
> +            ret = AVERROR(ENOMEM);
> +            break;
> +        }
> +        /* Keep one byte in reserve for the terminating 0 written after
> +         * the loop. */
> +        outsize_int--;
> +        outsize = outsize_int;
> +        insize_orig = insize;
> +        /* The return value of iconv() is not checked; failure is detected
> +         * only by the absence of progress on the input side.
> +         * NOTE(review): a call that consumes some input before hitting an
> +         * invalid sequence is treated as progress; the error is then
> +         * presumably caught on the next iteration - confirm. */
> +        iconv(cd, &inbuf, &insize, &outbuf, &outsize);
> +        if (insize == insize_orig) {
> +            ret = AVERROR_INVALIDDATA;
> +            break;
> +        }
> +        /* iconv() wrote straight into the bprint buffer, so account for
> +         * the bytes it produced by advancing the length by hand. */
> +        bp.len += outsize_int - outsize;
> +    }
> +    /* NOTE(review): iconv() is never called with a NULL input to flush a
> +     * pending shift state before closing; presumably only relevant for
> +     * stateful encodings - confirm. */
> +    iconv_close(cd);
> +    if (ret < 0) {
> +        /* Discard the partial output. */
> +        av_bprint_finalize(&bp, NULL);
> +        return ret;
> +    }
> +    /* The loop can only end without an error once insize reached 0. */
> +    av_assert1(!insize);
> +    /* Uses the byte that was held back from iconv() on every iteration. */
> +    bp.str[bp.len] = 0;
> +    if ((ret = av_bprint_finalize(&bp, &recoded)) < 0)
> +        return ret;
> +    /* Conversion succeeded: swap the original data for the recoded copy. */
> +    av_free(tf->full_data);
> +    tf->full_data      = recoded;
> +    tf->full_data_size = bp.len;
> +    tf->encoding       = encoding;
> +    return 0;
> +}
> +
> +/**
> + * Find an encoding under which tf->full_data converts successfully, and
> + * leave the data converted by try_encoding().
> + *
> + * The list of candidates is chosen as follows:
> + *  - tf->encodings, if it is set;
> + *  - otherwise, if the data begins with one of the entries of
> + *    byte_order_marks, only the encoding associated with that entry;
> + *  - otherwise default_encodings.
> + * Candidates are tried in order and the first success ends the search.
> + *
> + * @param tf text file to examine and convert
> + * @return the non-negative value returned by the first successful
> + *         try_encoding() call; AVERROR_INVALIDDATA, with a message copied
> + *         into tf->error, if every candidate failed
> + */
> +static int guess_encoding(AVTextFile *tf)
> +{
> +    /* Single-candidate, NULL-terminated list filled in when a BOM matches. */
> +    const char *bom_encoding[2] = { NULL, NULL };
> +    const char *const *encodings;
> +    int ret, i;
> +
> +    encodings = tf->encodings;
> +    if (!encodings) {
> +        for (i = 0; i < FF_ARRAY_ELEMS(byte_order_marks); i++) {
> +            /* NOTE(review): the comparison length is the BOM length and is
> +             * not checked against tf->full_data_size here; confirm that
> +             * the buffer is always at least that long. */
> +            if (!memcmp(tf->full_data, byte_order_marks[i].bom,
> +                                       byte_order_marks[i].len)) {
> +                encodings = bom_encoding;
> +                bom_encoding[0] = byte_order_marks[i].encoding;
> +                break;
> +            }
> +        }
> +        /* No BOM recognized: fall back to the built-in list. */
> +        if (!encodings)
> +            encodings = default_encodings;
> +    }
> +
> +    /* The specific error code of each failed attempt is discarded. */
> +    for (i = 0; encodings[i]; i++)
> +        if ((ret = try_encoding(tf, encodings[i])) >= 0)
> +            return ret;
> +
> +    av_strlcpy(tf->error, "Unable to guess character encoding",
> +               sizeof(tf->error));
> +    return AVERROR_INVALIDDATA;
> +}

I assume this is for subtitle support.

There are so many libraries which try to auto-detect encodings using
elaborate statistical methods etc., and they all fail sometimes in one
way or another - and this is supposed to be sufficient? How do you even
distinguish these 8-bit codepage encodings?

How can an application do its own auto-detection?


More information about the ffmpeg-devel mailing list