[FFmpeg-devel] [PATCH] lavc: support subtitles charset conversion.

Fri Jan 4 06:24:16 CET 2013

On Thu, Jan 03, 2013 at 04:53:02PM +0100, Nicolas George wrote:
[...]
> > >     AV_TEXT_ENCODING_MODE_POST,    //< lavc must recode the decoded text
> > That sounds like the perfect place ;)
> 
> Actually, no: the more I think about it, the more I believe that PRE is way
> better than POST. Unless I am mistaken, recoding before the decoder would
> work just as well for all current codecs an ASCII-compatible encodings, plus
> it will work with non-ASCII-compatible encodings just as well.
> 
> The only situation where POST would be better than PRE seems to be if the
> codec mixes binary and text data (for example: U16BE line_length;
> U8 line[line_length]; repeat). And we do not have those currently.
> 
> > Except that it doesn't contain the buffer size, so it can only do ASCII
> > compliant charset conversions.
> 
> The problem is not just the buffer size: the decoders produce ASS markup in
> ASCII, so, for example, a line break, "\N" would become U+5C4E
> (non-existent) in UTF-16-BE (depending on the parity of the text before).
> 

Indeed, you're right. New patch attached with pre-mode.

> > Note: inside the demuxer, you don't have access to the codec charset
> > (options are not yet populated). Inside the decoder that's possible.
> 
> The same problem happens for the rawvideo or pcm demuxers: the frame size or
> sample rate is not available. This is solved using a private option with a
> standardized name.
> 
> > I must say I have a hard time following what you actually want me to do.
> > Can you tell me more about what you want to want to expose to the user
> > first?
> 
> Do you mean API user or command-line tool user?
> 

I was wondering what in my patch was preventing any code evolution, so my
question was about any interface with the user.

Now that the character re-encoding is done before decoding, do you see any
design problem with it?

Note about the FIXME in the patch: we should add some kind of flags to the
codec because the recoding will obviously have unwanted behaviour with
bitmap sub packets.

-- 
Clément B.
-------------- next part --------------
From 9f5abea45ed8750ca2cee384b65749ec64f422f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <ubitux at gmail.com>
Date: Mon, 31 Dec 2012 10:05:25 +0100
Subject: [PATCH 1/5] lavc: support subtitles recoding.

TODO: bump lavc minor
---
 Changelog                  |  1 +
 configure                  |  2 ++
 libavcodec/avcodec.h       |  7 ++++
 libavcodec/options_table.h |  1 +
 libavcodec/utils.c         | 80 ++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/Changelog b/Changelog
index 7bea6af..0933df6 100644
--- a/Changelog
+++ b/Changelog
@@ -54,6 +54,7 @@ version <next>:
 - Sony Wave64 muxer
 - adobe and limelight publisher authentication in RTMP
 - data: URI scheme
+- Subtitles character re-encoding
 
 
 version 1.0:
diff --git a/configure b/configure
index 32f7eb1..d2b6162 100755
--- a/configure
+++ b/configure
@@ -1380,6 +1380,7 @@ HAVE_LIST="
     glob
     gnu_as
     ibm_asm
+    iconv
     inet_aton
     io_h
     isatty
@@ -3696,6 +3697,7 @@ check_func  getopt
 check_func  getrusage
 check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
 check_func  gettimeofday
+check_func  iconv
 check_func  inet_aton $network_extralibs
 check_func  isatty
 check_func  localtime_r
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 012a31c..060589b 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -3188,6 +3188,13 @@ typedef struct AVCodecContext {
      * - encoding: unused
      */
     AVDictionary *metadata;
+
+    /**
+     * Character encoding of the input subtitles file.
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    char *sub_charenc;
 } AVCodecContext;
 
 AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index 2a31fa6..75c1961 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -403,6 +403,7 @@ static const AVOption options[]={
 {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
 {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
 {NULL},
 };
 
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index f5ceae4..501084d 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -47,6 +47,9 @@
 #include <stdarg.h>
 #include <limits.h>
 #include <float.h>
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
 
 volatile int ff_avcodec_locked;
 static int volatile entangled_thread_counter = 0;
@@ -1058,6 +1061,12 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
             ret = AVERROR(EINVAL);
             goto free_and_end;
         }
+        if (!HAVE_ICONV && avctx->codec_type == AVMEDIA_TYPE_SUBTITLE && avctx->sub_charenc) {
+            av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles conversion needs "
+                   "a libavcodec built with iconv support.\n");
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
     }
 end:
     ff_unlock_avcodec();
@@ -1816,6 +1825,64 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
     return ret;
 }
 
+#if HAVE_ICONV
+#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
+static int recode_subtitle(AVCodecContext *avctx,
+                           AVPacket *outpkt, const AVPacket *inpkt)
+{
+    iconv_t cd = (iconv_t)-1;
+    int ret = 0;
+    char *inb, *outb;
+    size_t inl, outl;
+    AVPacket tmp;
+
+    if (!avctx->sub_charenc) // FIXME add a codec prop for sub codec supporting it
+        goto end;
+
+    cd = iconv_open("UTF-8", avctx->sub_charenc);
+    if (cd == (iconv_t)-1) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
+               "with input character encoding \"%s\"\n", avctx->sub_charenc);
+        ret = AVERROR(errno);
+        goto end;
+    }
+
+    inb = inpkt->data;
+    inl = inpkt->size;
+
+    if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
+    if (ret < 0)
+        goto end;
+    outpkt->data = tmp.data;
+    outpkt->size = tmp.size;
+    outb = outpkt->data;
+    outl = outpkt->size;
+
+    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
+        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
+        outl >= outpkt->size || inl != 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
+               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
+        av_free_packet(&tmp);
+        ret = AVERROR(errno);
+        goto end;
+    }
+    outpkt->size -= outl;
+    outpkt->data[outpkt->size - 1] = '\0';
+
+end:
+    if (cd != (iconv_t)-1)
+        iconv_close(cd);
+    return ret;
+}
+#endif
+
 int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt)
@@ -1831,16 +1898,25 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
     avcodec_get_subtitle_defaults(sub);
 
     if (avpkt->size) {
+        AVPacket pkt_recoded;
         AVPacket tmp = *avpkt;
         int did_split = av_packet_split_side_data(&tmp);
         //apply_param_change(avctx, &tmp);
 
-        avctx->pkt = &tmp;
+        pkt_recoded = tmp;
+#if HAVE_ICONV
+        if (recode_subtitle(avctx, &pkt_recoded, &tmp) < 0)
+            pkt_recoded = tmp;
+#endif
+
+        avctx->pkt = &pkt_recoded;
 
         if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
             sub->pts = av_rescale_q(avpkt->pts,
                                     avctx->pkt_timebase, AV_TIME_BASE_Q);
-        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &tmp);
+        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
+        if (tmp.data != pkt_recoded.data)
+            av_free(pkt_recoded.data);
         sub->format = sub->num_rects && sub->rects[0]->ass;
 
         avctx->pkt = NULL;
-- 
1.8.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130104/eb22329f/attachment.asc>