[FFmpeg-devel] [PATCH] lavc: support subtitles charset conversion.
Nicolas George
nicolas.george at normalesup.org
Mon Dec 31 11:37:33 CET 2012
Le primidi 11 nivôse, an CCXXI, Clement Boesch a écrit :
> ---
> Changelog | 1 +
> configure | 2 ++
> libavcodec/avcodec.h | 7 +++++
> libavcodec/options_table.h | 1 +
> libavcodec/utils.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-
> 5 files changed, 75 insertions(+), 1 deletion(-)
I am sorry, but this will not work. I have on occasion found srt files in
UCS-2/UTF-16 (with byte order mark, so totally recognizable). Since this
encoding is not compatible with ASCII, recoding after the subtitle has been
stored in the AVSubtitleRect.ass field is not possible.
Of course, currently these files just fail to parse, but we will probably
want to change that at some point¹. Also, I suspect there are a few muxed
files with S_TEXT or equivalent streams in ASCII-incompatible encodings.
I am not saying that this patch should be completely trashed, but it needs a
lot more of forethought.
1: it might be pretty easy, in fact: introduce and use ff_read_text_file
that returns an array of lines and does all the autodetection and recoding.
> diff --git a/Changelog b/Changelog
> index b0a0de5..f2bc343 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -52,6 +52,7 @@ version <next>:
> - NIST Sphere demuxer
> - MPL2, VPlayer, MPlayer, AQTitle, PJS and SubViewer v1 subtitles demuxers and decoders
> - Sony Wave64 muxer
> +- Subtitles charset conversion
AFAIK, "charset" is invalid. Technically, UTF-8 and UCS-4 are the same
character *set*, Unicode, and they only differ in their byte encoding; the
correct name would be "character encoding", but this is a bit long.
(For searchability, "charset" should probably be present in the docs,
though, along with "codepage" for microsoft-addicts.)
>
>
> version 1.0:
> diff --git a/configure b/configure
> index faea887..0360fc9 100755
> --- a/configure
> +++ b/configure
> @@ -1380,6 +1380,7 @@ HAVE_LIST="
> glob
> gnu_as
> ibm_asm
> + iconv
> inet_aton
> io_h
> isatty
> @@ -3695,6 +3696,7 @@ check_func getopt
> check_func getrusage
> check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
> check_func gettimeofday
> +check_func iconv
> check_func inet_aton $network_extralibs
> check_func isatty
> check_func localtime_r
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 012a31c..7a3761d 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -3188,6 +3188,13 @@ typedef struct AVCodecContext {
> * - encoding: unused
> */
> AVDictionary *metadata;
> +
> + /**
> + * Charset of the input subtitles file.
> + * - decoding: set by user
> + * - encoding: unused
> + */
> + char *sub_charset;
> } AVCodecContext;
>
> AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 2a31fa6..38d493d 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -403,6 +403,7 @@ static const AVOption options[]={
> {"ka", "Karaoke", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE }, INT_MIN, INT_MAX, A|E, "audio_service_type"},
> {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
> {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> +{"sub_charset", "set input text subtitles charset", OFFSET(sub_charset), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
> {NULL},
> };
>
> diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> index 918954a..a9a5168 100644
> --- a/libavcodec/utils.c
> +++ b/libavcodec/utils.c
> @@ -47,6 +47,9 @@
> #include <stdarg.h>
> #include <limits.h>
> #include <float.h>
> +#if HAVE_ICONV
> +# include <iconv.h>
> +#endif
>
> volatile int ff_avcodec_locked;
> static int volatile entangled_thread_counter = 0;
> @@ -1058,6 +1061,12 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
> ret = AVERROR(EINVAL);
> goto free_and_end;
> }
> + if (!HAVE_ICONV && avctx->codec_type == AVMEDIA_TYPE_SUBTITLE && avctx->sub_charset) {
> + av_log(avctx, AV_LOG_ERROR, "Charset subtitles conversion needs "
> + "a libavcodec built with iconv support.\n");
> + ret = AVERROR(EINVAL);
> + goto free_and_end;
> + }
> }
> end:
> ff_unlock_avcodec();
> @@ -1816,6 +1825,56 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
> return ret;
> }
>
> +#if HAVE_ICONV
> +static int recode_subtitle(AVCodecContext *avctx, AVSubtitle *sub)
> +{
> + int i, ret = 0;
> + iconv_t cd;
> +
> + if (!avctx->sub_charset || sub->format == 0)
> + return 0;
> +
> + cd = iconv_open("UTF-8", avctx->sub_charset);
> + if (cd == (iconv_t)-1) {
> + av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> + "with input charset \"%s\"\n", avctx->sub_charset);
> + return AVERROR_EXTERNAL;
> + }
Opening a new iconv for each event may be quite expensive.
> +
> + for (i = 0; i < sub->num_rects; i++) {
> + char *inb, *outb, *inb_start, *outb_start;
> + size_t inl, outl;
> +
> + inb = inb_start = sub->rects[i]->ass;
> + if (!inb)
> + break;
> + inl = strlen(inb);
> + outl = 4*inl;
Possible overflow (but very unlikely).
Also, what is that 4*? UTF-8 can use up to 6 bytes per codepoint. Enlarging
the buffer in case of overflow may be AVERROR_PATCHWELCOME.
> + outb = outb_start = av_malloc(outl + 1);
> + if (!outb) {
> + ret = AVERROR(ENOMEM);
> + break;
> + }
> +
> + if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> + iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1) {
> + av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> + "from %s to UTF-8\n", inb_start, avctx->sub_charset);
> + av_free(outb_start);
> + ret = AVERROR_EXTERNAL;
AVERROR(errno)?
> + break;
> + }
> + *outb = 0;
> +
> + av_free(inb_start);
> + sub->rects[i]->ass = outb_start;
> + }
> +
> + iconv_close(cd);
> + return ret;
> +}
> +#endif
> +
> int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> int *got_sub_ptr,
> AVPacket *avpkt)
> @@ -1850,8 +1909,12 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> ret = avpkt->size;
> }
>
> - if (*got_sub_ptr)
> + if (*got_sub_ptr) {
> +#if HAVE_ICONV
> + ret = recode_subtitle(avctx, sub);
> +#endif
> avctx->frame_number++;
> + }
> }
>
> return ret;
Regards,
--
Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20121231/78cd1f84/attachment.asc>
More information about the ffmpeg-devel
mailing list