[FFmpeg-devel] [PATCH 2/2] lavc: support subtitles charset conversion.

Nicolas George nicolas.george at normalesup.org
Tue Jan 8 12:29:36 CET 2013


Thanks for the new version. Only technical points now:

> I don't understand what you mean here: -sub_charenc is used to specify the
> character encoding of the input, so using -sub_charenc copy would mean

For now, sub_charenc is only for input, but at some point I expect someone
will address the character encoding (and LF/CRLF status) of output subtitles
files too. That is not very important though.


L'octidi 18 nivôse, an CCXXI, Clement Boesch a écrit :
> TODO: bump lavc micro
> ---
>  Changelog                    |   2 +
>  configure                    |   2 +
>  libavcodec/avcodec.h         |  18 ++++++++
>  libavcodec/options_table.h   |   1 +
>  libavcodec/utils.c           | 108 +++++++++++++++++++++++++++++++++++++++++--
>  libavformat/assdec.c         |   1 +
>  libavformat/tedcaptionsdec.c |   1 +
>  libavformat/webvttdec.c      |   1 +
>  8 files changed, 131 insertions(+), 3 deletions(-)
> 
> diff --git a/Changelog b/Changelog
> index 1a2e96a..42a38c8 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -3,6 +3,8 @@ releases are sorted from youngest to oldest.
>  
>  version <next>:
>  
> +- Subtitles character re-encoding
> +
>  
>  version 1.1:
>  
> diff --git a/configure b/configure
> index 89e076f..1b119e8 100755
> --- a/configure
> +++ b/configure
> @@ -1380,6 +1380,7 @@ HAVE_LIST="
>      glob
>      gnu_as
>      ibm_asm
> +    iconv
>      inet_aton
>      io_h
>      isatty
> @@ -3698,6 +3699,7 @@ check_func  getopt
>  check_func  getrusage
>  check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
>  check_func  gettimeofday
> +check_func  iconv
>  check_func  inet_aton $network_extralibs
>  check_func  isatty
>  check_func  localtime_r
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 62f5474..be62fc8 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -3195,6 +3195,24 @@ typedef struct AVCodecContext {
>       * - encoding: unused
>       */
>      AVDictionary *metadata;
> +

> +    /**
> +     * Character encoding of the input subtitles file.
> +     * - decoding: set by user, libavformat or libavcodec
> +     * - encoding: unused
> +     */
> +    char *sub_charenc;

Needs a not about not accessing it directly from outside lavc (and possibly
an accessor).

> +
> +    /**
> +     * Subtitles character encoding mode.

> +     * - decoding: set by libavformat or libavcodec, not intended to be used by user apps

Do we consider lavf mandatory for subtitles demuxing? IMHO, from lavc's
point of view, lavf is a "user app".

> +     * - encoding: unused
> +     */
> +    int sub_charenc_mode;

> +#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
> +#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
> +#define FF_SUB_CHARENC_MODE_DECODER_PRE  2  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
> +//#define FF_SUB_CHARENC_MODE_DECODER_POST 3  ///< the AVSubitle data needs to be recoded to UTF-8 after the decoder pass, requires iconv

Nit: enum?

>  } AVCodecContext;
>  
>  AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 2a31fa6..75c1961 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -403,6 +403,7 @@ static const AVOption options[]={
>  {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
>  {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
>  {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> +{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
>  {NULL},
>  };
>  
> diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> index 99b2202..3e2326e 100644
> --- a/libavcodec/utils.c
> +++ b/libavcodec/utils.c
> @@ -47,6 +47,9 @@
>  #include <stdarg.h>
>  #include <limits.h>
>  #include <float.h>
> +#if HAVE_ICONV
> +# include <iconv.h>
> +#endif
>  
>  volatile int ff_avcodec_locked;
>  static int volatile entangled_thread_counter = 0;
> @@ -1058,6 +1061,34 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
>              ret = AVERROR(EINVAL);
>              goto free_and_end;
>          }
> +        if (avctx->sub_charenc) {
> +            if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
> +                av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
> +                       "supported with subtitles codecs\n");
> +                ret = AVERROR(EINVAL);
> +                goto free_and_end;
> +            } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
> +                av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
> +                       "subtitles character encoding will be ignored\n",
> +                       avctx->codec_descriptor->name);
> +                avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> +            } else {
> +                /* input character encoding is set for a text based subtitle
> +                 * codec at this point */
> +                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
> +                    avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DECODER_PRE;
> +
> +                if (!HAVE_ICONV && avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_DECODER_PRE) {
> +                    av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
> +                           "conversion needs a libavcodec built with iconv support "
> +                           "for this codec\n");
> +                    ret = AVERROR(ENOSYS);
> +                    goto free_and_end;
> +                }
> +            }
> +        } else {
> +            avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> +        }
>      }
>  end:
>      ff_unlock_avcodec();
> @@ -1816,6 +1847,68 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
>      return ret;
>  }
>  
> +#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
> +static int recode_subtitle(AVCodecContext *avctx,
> +                           AVPacket *outpkt, const AVPacket *inpkt)
> +{
> +#if HAVE_ICONV
> +    iconv_t cd = (iconv_t)-1;
> +    int ret = 0;
> +    char *inb, *outb;
> +    size_t inl, outl;
> +    AVPacket tmp;
> +#endif
> +
> +    if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_DECODER_PRE)
> +        return 0;
> +
> +#if HAVE_ICONV
> +    cd = iconv_open("UTF-8", avctx->sub_charenc);
> +    if (cd == (iconv_t)-1) {
> +        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> +               "with input character encoding \"%s\"\n", avctx->sub_charenc);
> +        ret = AVERROR(errno);
> +        goto end;
> +    }
> +
> +    inb = inpkt->data;
> +    inl = inpkt->size;
> +
> +    if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
> +        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
> +        ret = AVERROR(ENOMEM);
> +        goto end;
> +    }
> +
> +    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
> +    if (ret < 0)
> +        goto end;
> +    outpkt->data = tmp.data;
> +    outpkt->size = tmp.size;
> +    outb = outpkt->data;
> +    outl = outpkt->size;
> +
> +    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> +        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
> +        outl >= outpkt->size || inl != 0) {
> +        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> +               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
> +        av_free_packet(&tmp);
> +        ret = AVERROR(errno);
> +        goto end;
> +    }
> +    outpkt->size -= outl;
> +    outpkt->data[outpkt->size - 1] = '\0';
> +
> +end:
> +    if (cd != (iconv_t)-1)
> +        iconv_close(cd);
> +    return ret;
> +#else
> +    av_assert0(!"requesting subtitles recoding without iconv");
> +#endif
> +}
> +
>  int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
>                               int *got_sub_ptr,
>                               AVPacket *avpkt)
> @@ -1831,19 +1924,28 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
>      avcodec_get_subtitle_defaults(sub);
>  
>      if (avpkt->size) {
> +        AVPacket pkt_recoded;
>          AVPacket tmp = *avpkt;
>          int did_split = av_packet_split_side_data(&tmp);
>          //apply_param_change(avctx, &tmp);
>  
> -        avctx->pkt = &tmp;
> +        pkt_recoded = tmp;
> +        ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
> +        if (ret < 0) {
> +            *got_sub_ptr = 0;
> +        } else {
> +        avctx->pkt = &pkt_recoded;
>  
>          if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
>              sub->pts = av_rescale_q(avpkt->pts,
>                                      avctx->pkt_timebase, AV_TIME_BASE_Q);
> -        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &tmp);
> +        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
> +        if (tmp.data != pkt_recoded.data)
> +            av_free(pkt_recoded.data);
>          sub->format = !(avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB);
> -
>          avctx->pkt = NULL;
> +        }
> +
>          if (did_split) {
>              ff_packet_free_side_data(&tmp);
>              if(ret == tmp.size)
> diff --git a/libavformat/assdec.c b/libavformat/assdec.c
> index 35fcb51..710a171 100644
> --- a/libavformat/assdec.c
> +++ b/libavformat/assdec.c
> @@ -93,6 +93,7 @@ static int ass_read_header(AVFormatContext *s)
>      avpriv_set_pts_info(st, 64, 1, 100);
>      st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;

>      st->codec->codec_id= AV_CODEC_ID_SSA;
> +    st->codec->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;

lavf should not access the final AVCodecContext fields directly. Also,
usually, FF_* macros are for intra-use only (but there are exceptions).

Another problem: at this point, sub_charenc is still NULL, and therefore
DO_NOTHING is redundant. If this code path leads to sub_charenc being
non-NULL, that means it has been set by the user: ignoring it seems wrong.
IMHO, the demuxer should only set the mode to DO_NOTHING if it actually does
something with the encoding.

(ASS files in something else than UTF-8 exist; I am sure malformed webftt
files exist too somewhere.)

>  
>      header_remaining= INT_MAX;
>  
> diff --git a/libavformat/tedcaptionsdec.c b/libavformat/tedcaptionsdec.c
> index 85bed0a..69d0d17 100644
> --- a/libavformat/tedcaptionsdec.c
> +++ b/libavformat/tedcaptionsdec.c
> @@ -296,6 +296,7 @@ static av_cold int tedcaptions_read_header(AVFormatContext *avf)
>          return AVERROR(ENOMEM);
>      st->codec->codec_type     = AVMEDIA_TYPE_SUBTITLE;
>      st->codec->codec_id       = CODEC_ID_TEXT;
> +    st->codec->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
>      avpriv_set_pts_info(st, 64, 1, 1000);
>      st->probe_packets = 0;
>      st->start_time    = 0;
> diff --git a/libavformat/webvttdec.c b/libavformat/webvttdec.c
> index 694a020..42d8162 100644
> --- a/libavformat/webvttdec.c
> +++ b/libavformat/webvttdec.c
> @@ -66,6 +66,7 @@ static int webvtt_read_header(AVFormatContext *s)
>      avpriv_set_pts_info(st, 64, 1, 1000);
>      st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;
>      st->codec->codec_id   = AV_CODEC_ID_WEBVTT;
> +    st->codec->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
>  
>      av_bprint_init(&header, 0, AV_BPRINT_SIZE_UNLIMITED);
>      av_bprint_init(&cue,    0, AV_BPRINT_SIZE_UNLIMITED);

Regards,

-- 
  Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130108/ea627277/attachment.asc>


More information about the ffmpeg-devel mailing list