[FFmpeg-devel] [PATCH] lavc: support subtitles charset conversion.

Nicolas George nicolas.george at normalesup.org
Mon Dec 31 11:37:33 CET 2012


Le primidi 11 nivôse, an CCXXI, Clement Boesch a écrit :
> ---
>  Changelog                  |  1 +
>  configure                  |  2 ++
>  libavcodec/avcodec.h       |  7 +++++
>  libavcodec/options_table.h |  1 +
>  libavcodec/utils.c         | 65 +++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 75 insertions(+), 1 deletion(-)

I am sorry, but this will not work. I have on occasion found srt files in
UCS-2/UTF-16 (with byte order mark, so totally recognizable). Since this
encoding is not compatible with ASCII, recoding after the subtitle has been
stored in the AVSubtitleRect.ass field is not possible.

Of course, currently these files just fail to parse, but we will probably
want to change that at some point¹. Also, I suspect there are a few muxed
files with S_TEXT or equivalent streams in ASCII-incompatible encodings.

I am not saying that this patch should be completely trashed, but it needs a
lot more of forethought.


1: it might be pretty easy, in fact: introduce and use ff_read_text_file
that returns an array of lines and does all the autodetection and recoding.

> diff --git a/Changelog b/Changelog
> index b0a0de5..f2bc343 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -52,6 +52,7 @@ version <next>:
>  - NIST Sphere demuxer
>  - MPL2, VPlayer, MPlayer, AQTitle, PJS and SubViewer v1 subtitles demuxers and decoders
>  - Sony Wave64 muxer

> +- Subtitles charset conversion

AFAIK, "charset" is invalid. Technically, UTF-8 and UCS-4 are the same
character *set*, Unicode, and they only differ in their byte encoding; the
correct name would be "character encoding", but this is a bit long.

(For searchability, "charset" should probably be present in the docs,
though, along with "codepage" for microsoft-addicts.)

>  
>  
>  version 1.0:
> diff --git a/configure b/configure
> index faea887..0360fc9 100755
> --- a/configure
> +++ b/configure
> @@ -1380,6 +1380,7 @@ HAVE_LIST="
>      glob
>      gnu_as
>      ibm_asm
> +    iconv
>      inet_aton
>      io_h
>      isatty
> @@ -3695,6 +3696,7 @@ check_func  getopt
>  check_func  getrusage
>  check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
>  check_func  gettimeofday
> +check_func  iconv
>  check_func  inet_aton $network_extralibs
>  check_func  isatty
>  check_func  localtime_r
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 012a31c..7a3761d 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -3188,6 +3188,13 @@ typedef struct AVCodecContext {
>       * - encoding: unused
>       */
>      AVDictionary *metadata;
> +
> +    /**
> +     * Charset of the input subtitles file.
> +     * - decoding: set by user
> +     * - encoding: unused
> +     */
> +    char *sub_charset;
>  } AVCodecContext;
>  
>  AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 2a31fa6..38d493d 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -403,6 +403,7 @@ static const AVOption options[]={
>  {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
>  {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
>  {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> +{"sub_charset", "set input text subtitles charset", OFFSET(sub_charset), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
>  {NULL},
>  };
>  
> diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> index 918954a..a9a5168 100644
> --- a/libavcodec/utils.c
> +++ b/libavcodec/utils.c
> @@ -47,6 +47,9 @@
>  #include <stdarg.h>
>  #include <limits.h>
>  #include <float.h>
> +#if HAVE_ICONV
> +# include <iconv.h>
> +#endif
>  
>  volatile int ff_avcodec_locked;
>  static int volatile entangled_thread_counter = 0;
> @@ -1058,6 +1061,12 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
>              ret = AVERROR(EINVAL);
>              goto free_and_end;
>          }
> +        if (!HAVE_ICONV && avctx->codec_type == AVMEDIA_TYPE_SUBTITLE && avctx->sub_charset) {
> +            av_log(avctx, AV_LOG_ERROR, "Charset subtitles conversion needs "
> +                   "a libavcodec built with iconv support.\n");
> +            ret = AVERROR(EINVAL);
> +            goto free_and_end;
> +        }
>      }
>  end:
>      ff_unlock_avcodec();
> @@ -1816,6 +1825,56 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
>      return ret;
>  }
>  
> +#if HAVE_ICONV
> +static int recode_subtitle(AVCodecContext *avctx, AVSubtitle *sub)
> +{
> +    int i, ret = 0;
> +    iconv_t cd;
> +
> +    if (!avctx->sub_charset || sub->format == 0)
> +        return 0;
> +

> +    cd = iconv_open("UTF-8", avctx->sub_charset);
> +    if (cd == (iconv_t)-1) {
> +        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> +               "with input charset \"%s\"\n", avctx->sub_charset);
> +        return AVERROR_EXTERNAL;
> +    }

Opening a new iconv for each event may be quite expensive.

> +
> +    for (i = 0; i < sub->num_rects; i++) {
> +        char *inb, *outb, *inb_start, *outb_start;
> +        size_t inl, outl;
> +
> +        inb = inb_start = sub->rects[i]->ass;
> +        if (!inb)
> +            break;

> +        inl  = strlen(inb);
> +        outl = 4*inl;

Possible overflow (but very unlikely).

Also, what is that 4*? UTF-8 can use up to 6 bytes per codepoint. Enlarging
the buffer in case of overflow may be AVERROR_PATCHWELCOME.

> +        outb = outb_start = av_malloc(outl + 1);
> +        if (!outb) {
> +            ret = AVERROR(ENOMEM);
> +            break;
> +        }
> +
> +        if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> +            iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1) {
> +            av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> +                   "from %s to UTF-8\n", inb_start, avctx->sub_charset);
> +            av_free(outb_start);

> +            ret = AVERROR_EXTERNAL;

AVERROR(errno)?

> +            break;
> +        }
> +        *outb = 0;
> +
> +        av_free(inb_start);
> +        sub->rects[i]->ass = outb_start;
> +    }
> +
> +    iconv_close(cd);
> +    return ret;
> +}
> +#endif
> +
>  int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
>                               int *got_sub_ptr,
>                               AVPacket *avpkt)
> @@ -1850,8 +1909,12 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
>                  ret = avpkt->size;
>          }
>  
> -        if (*got_sub_ptr)
> +        if (*got_sub_ptr) {
> +#if HAVE_ICONV
> +            ret = recode_subtitle(avctx, sub);
> +#endif
>              avctx->frame_number++;
> +        }
>      }
>  
>      return ret;

Regards,

-- 
  Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20121231/78cd1f84/attachment.asc>


More information about the ffmpeg-devel mailing list