[FFmpeg-devel] [PATCH] lavu/avstring: add av_get_utf8() function
Nicolas George
george at nsup.org
Thu Nov 14 15:35:45 CET 2013
Le quartidi 24 brumaire, an CCXXII, Stefano Sabatini a écrit :
> Suppose that you have an overflow with PTR+1, then you have PTR+1=0 <
> PTR, in this case the code will misbehave. I don't know if the specs
> explicitly allow this (PTR+1 for every allocated byte pointer should
> not overflow).
I am pretty sure the standard implies that if PTR points to an object, then
PTR+1 can not overflow. Which, in practical cases, means than (type *)-1 can
never be dereferenced.
> >From 7fb8c0f07de33428f3050161e58bb9bb666f6f5e Mon Sep 17 00:00:00 2001
> From: Stefano Sabatini <stefasab at gmail.com>
> Date: Thu, 3 Oct 2013 01:21:40 +0200
> Subject: [PATCH] lavu/avstring: add av_utf8_decode() function
>
> ---
> doc/APIchanges | 3 +++
> libavutil/Makefile | 1 +
> libavutil/avstring.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
> libavutil/avstring.h | 40 +++++++++++++++++++++++++++++++
> libavutil/utf8.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> libavutil/version.h | 2 +-
> 6 files changed, 175 insertions(+), 1 deletion(-)
> create mode 100644 libavutil/utf8.c
>
> diff --git a/doc/APIchanges b/doc/APIchanges
> index dfdc159..b292d19 100644
> --- a/doc/APIchanges
> +++ b/doc/APIchanges
> @@ -15,6 +15,9 @@ libavutil: 2012-10-22
>
> API changes, most recent first:
>
> +2013-11-12 - xxxxxxx - lavu 52.53.100 - avstring.h
> + Add av_utf8_decode() function.
> +
> 2013-11-xx - xxxxxxx - lavc 55.41.100 / 55.25.0 - avcodec.h
> lavu 52.51.100 - frame.h
> Add ITU-R BT.2020 and other not yet included values to color primaries,
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 7b3b439..19540e4 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -155,6 +155,7 @@ TESTPROGS = adler32 \
> sha \
> sha512 \
> tree \
> + utf8 \
> xtea \
>
> TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo
> diff --git a/libavutil/avstring.c b/libavutil/avstring.c
> index eed58fa..135f11a 100644
> --- a/libavutil/avstring.c
> +++ b/libavutil/avstring.c
> @@ -307,6 +307,70 @@ int av_isxdigit(int c)
> return av_isdigit(c) || (c >= 'a' && c <= 'f');
> }
>
> +int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_last,
> + unsigned int flags)
> +{
> + const uint8_t *p = *bufp;
> + uint32_t top;
> + uint64_t code;
> + int ret = 0;
> +
> + if (p > buf_last)
> + return 0;
> +
> + code = *p++;
> +
> + /* first sequence byte starts with 10, or is 1111-1110 or 1111-1111,
> + which is not admitted */
> + if ((code & 0xc0) == 0x80 || code >= 0xFE) {
> + ret = AVERROR(EILSEQ);
> + goto end;
> + }
> + top = (code & 128) >> 1;
> +
> + while (code & top) {
> + int tmp;
> + if (p > buf_last) {
> + ret = AVERROR(EILSEQ); /* incomplete sequence */
> + goto end;
> + }
> +
> + /* we assume the byte to be in the form 10xx-xxxx */
> + tmp = *p++ - 128; /* strip leading 1 */
> + if (tmp>>6) {
> + ret = AVERROR(EILSEQ);
> + goto end;
> + }
> + code = (code<<6) + tmp;
> + top <<= 5;
> + }
> + code &= (top << 1) - 1;
> +
> + if (code >= 1<<31) {
> + ret = AVERROR(EILSEQ); /* out-of-range value */
> + goto end;
> + }
> +
> + *codep = code;
> +
> + if (flags & AV_UTF8_CHECK_FLAG_EXCLUDE_OVERLONG_CODES &&
> + code > 0x10FFFF)
> + ret = AVERROR(EILSEQ);
> + if (flags & AV_UTF8_CHECK_FLAG_EXCLUDE_CONTROL_CODES &&
> + code != 0x9 && code != 0xA && code != 0xD && code < 0x20)
Checking code < 0x20 first may be more efficient, and IMHO easier to
understand.
> + ret = AVERROR(EILSEQ);
> + if (flags & AV_UTF8_CHECK_FLAG_EXCLUDE_SURROGATES &&
> + code >= 0xD800 && code <= 0xDFFF)
> + ret = AVERROR(EILSEQ);
> + if (flags & AV_UTF8_CHECK_FLAG_EXCLUDE_NON_CHARACTERS &&
> + code == 0xFFFE || code == 0xFFFF)
> + ret = AVERROR(EILSEQ);
> +
> +end:
> + *bufp = p;
> + return ret;
> +}
> +
> #ifdef TEST
>
> int main(void)
> diff --git a/libavutil/avstring.h b/libavutil/avstring.h
> index 438ef79..4944a58 100644
> --- a/libavutil/avstring.h
> +++ b/libavutil/avstring.h
> @@ -22,6 +22,7 @@
> #define AVUTIL_AVSTRING_H
>
> #include <stddef.h>
> +#include <stdint.h>
> #include "attributes.h"
>
> /**
> @@ -295,6 +296,45 @@ enum AVEscapeMode {
> int av_escape(char **dst, const char *src, const char *special_chars,
> enum AVEscapeMode mode, int flags);
>
> +#define AV_UTF8_CHECK_FLAG_EXCLUDE_OVERLONG_CODES 1 ///< exclude codepoints over 0x10FFFF
Vocabulary mistake: overlong codes are when you use three bytes or more for
a code in the 0x80-0x7FF that could be coded on two bytes, and so on. They
are, technically, invalid UTF-8.
I do not know if there is a standard name for the code points beyond
0x10FFFF. Maybe "remote planes".
> +#define AV_UTF8_CHECK_FLAG_EXCLUDE_CONTROL_CODES 2 ///< exclude control codes
Nit: you are only excluding ASCII control codes, there are also control
codes in the 0x80-9F range. And there is 0x7F, which is ASCII and control.
> +#define AV_UTF8_CHECK_FLAG_EXCLUDE_SURROGATES 4 ///< exclude UTF-16 surrogates codes
> +#define AV_UTF8_CHECK_FLAG_EXCLUDE_NON_CHARACTERS 8 ///< exclude non-characters - 0xFFFE and 0xFFFF
I really thing they should be rejected by default, i.e. the flags should be:
> +#define AV_UTF8_CHECK_FLAG_ACCEPT_SURROGATES 4 ///< do not reject UTF-16 surrogates codes
> +#define AV_UTF8_CHECK_FLAG_XML_SAFE \
Nit: plural to FLAGS?
> + AV_UTF8_CHECK_FLAG_EXCLUDE_SURROGATES|AV_UTF8_CHECK_FLAG_EXCLUDE_OVERLONG_CODES|AV_UTF8_CHECK_FLAG_EXCLUDE_NON_CHARACTERS|AV_UTF8_CHECK_FLAG_EXCLUDE_CONTROL_CODES
Nit: split line and align.
Also, the "CHECK_" in the name seems quite useless except at making the
lines of code longer.
> +
> +#define AV_UTF8_CHECK_FLAG_LOOSE \
> + AV_UTF8_CHECK_FLAG_EXCLUDE_SURROGATES|AV_UTF8_CHECK_FLAG_EXCLUDE_OVERLONG_CODES|AV_UTF8_CHECK_FLAG_EXCLUDE_NON_CHARACTERS
Naively, I would expect a flag called LOOSE to cause the function to accept
more (invalid) input. IMHO, all these flags should work the other way
around.
> +
> +/**
> + * Read and decode a single UTF-8 code point (character) from the
> + * buffer in *buf, and update *buf to point to the next byte to
> + * decode.
> + *
> + * In case of an invalid byte sequence, the pointer will be updated to
> + * the next byte after the invalid sequence and the function will
> + * return an error code.
> + *
> + * Depending on the specified flags, the function will also fail in
> + * case the decoded code point does not belong to a valid range.
> + *
> + * @note For speed relevant code a carefully implemented use of
speed-relevant with a dash.
> + * GET_UTF8() may be preferred.
> + *
> + * @param code pointer used to return the parsed code in case of success
> + * @param buf pointer to the first byte of the sequence to decode
> +
> + * @param buf_last mark the last byte in the buffer. This is used to
> + * avoid buffer overread (in case of an unfinished
> + * UTF-8 sequence towards the end of the buffer).
> + * @param flags a collection of AV_UTF8_CHECK_FLAG_* flags
> + * @return >= 0 in case a sequence was successfully read, a negative
> + * value in case of invalid sequence
> + */
> +int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_last,
> + unsigned int flags);
> +
> /**
> * @}
> */
> diff --git a/libavutil/utf8.c b/libavutil/utf8.c
> new file mode 100644
> index 0000000..ec7942e
> --- /dev/null
> +++ b/libavutil/utf8.c
> @@ -0,0 +1,66 @@
> +/*
> + * Copyright (c) 2013 Stefano Sabatini
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdio.h>
> +
> +#include "libavutil/avstring.h"
> +#include "libavutil/file.h"
> +
> +static void print_sequence(const char *p, int l, int indent)
> +{
> + int i;
> + for (i = 0; i < l; i++)
> + printf("%02X", (uint8_t)p[i]);
> + printf("%*s", indent-l*2, "");
> +}
> +
> +int main(int argc, char **argv)
> +{
> + int ret;
> + char *filename = argv[1];
> + uint8_t *file_buf;
> + size_t file_buf_size;
> + uint32_t code;
> + const uint8_t *p, *endp;
> +
> + ret = av_file_map(filename, &file_buf, &file_buf_size, 0, NULL);
> + if (ret < 0)
> + return 1;
> +
> + p = file_buf;
> + endp = file_buf + file_buf_size-1;
> + while (p <= endp) {
> + int l, r;
> + const uint8_t *p0 = p;
> + r = av_utf8_decode(&code, &p, endp, 0);
> + l = (int)(p-p0);
> + print_sequence(p0, l, 20);
> + if (r >= 0) {
> + printf("%-10d 0x%-10X %-5d ", code, code, l);
> + if (*p0 == '\n') { printf("\\n\n"); }
> + else { printf ("%.*s\n", l, p0); }
> + } else {
> + printf("-> illegal sequence\n");
> + }
> + }
> +
> + av_file_unmap(file_buf, file_buf_size);
> + return 0;
> +}
> diff --git a/libavutil/version.h b/libavutil/version.h
> index bd5e7b8..05c79fa 100644
> --- a/libavutil/version.h
> +++ b/libavutil/version.h
> @@ -75,7 +75,7 @@
> */
>
> #define LIBAVUTIL_VERSION_MAJOR 52
> -#define LIBAVUTIL_VERSION_MINOR 52
> +#define LIBAVUTIL_VERSION_MINOR 53
> #define LIBAVUTIL_VERSION_MICRO 100
>
> #define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
Regards,
--
Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 836 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20131114/baeb5352/attachment.asc>
More information about the ffmpeg-devel
mailing list