[FFmpeg-devel] [PATCH 1/2] lavu: add text_file API.

Nicolas George nicolas.george at normalesup.org
Thu Aug 8 14:08:43 CEST 2013


TODO: version bump, APIChanges entry, !HAVE_ICONV path.

Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
---
 libavutil/Makefile    |    1 +
 libavutil/text_file.c |  262 +++++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/text_file.h |  181 ++++++++++++++++++++++++++++++++++
 3 files changed, 444 insertions(+)
 create mode 100644 libavutil/text_file.c
 create mode 100644 libavutil/text_file.h

diff --git a/libavutil/Makefile b/libavutil/Makefile
index 21746f0..7d59a73 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -107,6 +107,7 @@ OBJS = adler32.o                                                        \
        samplefmt.o                                                      \
        sha.o                                                            \
        sha512.o                                                         \
+       text_file.o                                                      \
        time.o                                                           \
        timecode.o                                                       \
        tree.o                                                           \
diff --git a/libavutil/text_file.c b/libavutil/text_file.c
new file mode 100644
index 0000000..e5f8b78
--- /dev/null
+++ b/libavutil/text_file.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2013 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avassert.h"
+#include "avstring.h"
+#include "bprint.h"
+#include "text_file.h"
+
+#include <iconv.h>
+
+#define COPY_FROM_USER(var) \
+    av_assert0(var ## _user->struct_size <= sizeof(var)); \
+    memcpy(&var, var ## _user, var ## _user->struct_size);
+#define COPY_TO_USER(var) \
+    memcpy(var ## _user, &var, var ## _user->struct_size);
+
+/**
+ * Known byte order marks and the encoding each of them announces.
+ * Entries are matched in order with a prefix comparison, so the 4-byte
+ * UCS-4LE mark has to stay ahead of UTF-16LE, whose 2-byte mark is a
+ * prefix of it. Entry 0 (UTF-8) is also the mark that
+ * text_file_process() looks for in the recoded text.
+ * The encoding name is a plain char string because it is handed out as a
+ * const char *; bom holds raw bytes and is not 0-terminated when all four
+ * bytes are used, hence the explicit len.
+ */
+static const struct {
+    char          encoding[9];  /* encoding name, 0-terminated */
+    unsigned char bom[4];       /* raw bytes of the mark */
+    unsigned char len;          /* number of significant bytes in bom */
+} byte_order_marks[] = {
+    { "UTF-8",    "\xef\xbb\xbf",     3 },
+    { "UCS-4BE",  "\x00\x00\xfe\xff", 4 },
+    { "UCS-4LE",  "\xff\xfe\x00\x00", 4 },
+    { "UTF-16BE", "\xfe\xff",         2 },
+    { "UTF-16LE", "\xff\xfe",         2 },
+};
+
+/**
+ * Encodings tried, in this order, when the caller supplies no list and the
+ * data does not start with a known byte order mark.
+ * The list is terminated by NULL.
+ */
+static const char *const default_encodings[] = {
+    "UTF-8", "US-ASCII", "WINDOWS-1252", "ISO-8859-1", NULL,
+};
+
+/**
+ * Try to recode the whole input buffer from the given encoding to UTF-8.
+ *
+ * On success tf->full_data is replaced by a newly allocated, 0-terminated
+ * UTF-8 buffer (the previous buffer is freed), tf->full_data_size is set
+ * to its length and tf->encoding to the encoding argument (the pointer is
+ * stored, the string is not copied). On failure tf is left unchanged.
+ *
+ * @param tf        text file whose full_data / full_data_size describe
+ *                  the raw input
+ * @param encoding  name of the source encoding, as understood by iconv
+ * @return  0 on success, AVERROR(errno) if iconv_open() rejects the
+ *          encoding, AVERROR_INVALIDDATA if the data cannot be converted
+ *          from that encoding, AVERROR(ENOMEM) if the output buffer cannot
+ *          be obtained or grown
+ */
+static int try_encoding(AVTextFile *tf, const char *encoding)
+{
+    iconv_t cd;
+    AVBPrint bp;
+    char *inbuf, *outbuf, *recoded;
+    size_t insize, outsize, insize_orig, conv;
+    unsigned outsize_int;
+    int ret = 0;
+
+    if ((cd = iconv_open("UTF-8", encoding)) == (iconv_t)-1)
+        return AVERROR(errno);
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+    inbuf  = (char *)tf->full_data;
+    insize = tf->full_data_size;
+    while (insize) {
+        /* ask for at least 512 bytes of room for the next chunk */
+        av_bprint_get_buffer(&bp, 512, (unsigned char **)&outbuf, &outsize_int);
+        if (outsize_int <= 1) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        /* keep one byte in reserve for the final 0 */
+        outsize_int--;
+        outsize = outsize_int;
+        insize_orig = insize;
+        conv = iconv(cd, &inbuf, &insize, &outbuf, &outsize);
+        if (insize == insize_orig) {
+            /* Nothing was consumed. If iconv complains about the output
+             * space (E2BIG), the buffer was too small for even one
+             * character, which is a memory problem and not a property of
+             * the data; any other cause means the input is not valid in
+             * this encoding. */
+            if (conv == (size_t)-1 && errno == E2BIG)
+                ret = AVERROR(ENOMEM);
+            else
+                ret = AVERROR_INVALIDDATA;
+            break;
+        }
+        /* account for the bytes iconv wrote straight into the buffer */
+        bp.len += outsize_int - outsize;
+    }
+    iconv_close(cd);
+    if (ret < 0) {
+        av_bprint_finalize(&bp, NULL);
+        return ret;
+    }
+    av_assert1(!insize);
+    bp.str[bp.len] = 0;
+    if ((ret = av_bprint_finalize(&bp, &recoded)) < 0)
+        return ret;
+    av_free(tf->full_data);
+    tf->full_data      = (uint8_t *)recoded;
+    tf->full_data_size = bp.len;
+    tf->encoding       = encoding;
+    return 0;
+}
+
+/**
+ * Find an encoding that the input can be recoded from, and recode it.
+ *
+ * If the caller supplied tf->encodings, only those are tried. Otherwise a
+ * byte order mark at the start of the data selects a single encoding, and
+ * without a mark the built-in default list is used. The first candidate
+ * accepted by try_encoding() wins.
+ *
+ * @param tf  text file with the raw input in full_data / full_data_size
+ * @return  a non-negative value on success; AVERROR_INVALIDDATA, with a
+ *          message stored in tf->error, if no candidate worked
+ */
+static int guess_encoding(AVTextFile *tf)
+{
+    const char *bom_encoding[2] = { NULL, NULL };
+    const char *const *encodings;
+    int ret, i;
+
+    encodings = tf->encodings;
+    if (!encodings) {
+        for (i = 0; i < FF_ARRAY_ELEMS(byte_order_marks); i++) {
+            /* the input may be shorter than the mark (or empty): check the
+             * size first so that memcmp never reads past the data */
+            if (tf->full_data_size >= byte_order_marks[i].len &&
+                !memcmp(tf->full_data, byte_order_marks[i].bom,
+                                       byte_order_marks[i].len)) {
+                bom_encoding[0] = (const char *)byte_order_marks[i].encoding;
+                encodings = bom_encoding;
+                break;
+            }
+        }
+        if (!encodings)
+            encodings = default_encodings;
+    }
+
+    for (i = 0; encodings[i]; i++)
+        if ((ret = try_encoding(tf, encodings[i])) >= 0)
+            return ret;
+
+    /* the individual failure codes are dropped on purpose: the caller only
+     * learns that no candidate encoding fit */
+    av_strlcpy(tf->error, "Unable to guess character encoding",
+               sizeof(tf->error));
+    return AVERROR_INVALIDDATA;
+}
+
+/**
+ * Drop every CR (\r) that is immediately followed by LF (\n), in place.
+ * A CR not followed by LF is kept. tf->text_size is updated to the new
+ * length and a 0 is written right after the compacted text.
+ * Looking one byte ahead of the last character is safe only because the
+ * text carries a terminating 0 after text_size bytes.
+ */
+static void remove_cr(AVTextFile *tf)
+{
+    uint8_t *const buf = tf->text;
+    const size_t   len = tf->text_size;
+    size_t rd, wr = 0;
+
+    for (rd = 0; rd < len; rd++) {
+        if (buf[rd] == '\r' && buf[rd + 1] == '\n')
+            continue;
+        buf[wr++] = buf[rd];
+    }
+    buf[wr] = 0;
+    tf->text_size = wr;
+}
+
+/**
+ * Build the tf->lines index by cutting tf->text at every LF (\n).
+ *
+ * Each LF is overwritten with 0 so that every line is a 0-terminated
+ * string inside tf->text; tf->text_size is not changed. tf->lines gets
+ * nb_lines pointers followed by a NULL pointer. An empty text yields zero
+ * lines, i.e. an array holding only the NULL terminator. If the last line
+ * has no LF, AV_TEXT_FLAG_NO_EOL is set in tf->text_flags.
+ *
+ * @param tf  text file with text / text_size already set
+ * @return  0 on success, AVERROR(ENOMEM) if the index cannot be allocated
+ */
+static int split_lines(AVTextFile *tf)
+{
+    size_t i, nb_lines = 0;
+    uint8_t *p, *end = tf->text + tf->text_size;
+
+    if (tf->text_size) {
+        nb_lines++;
+        /* an LF in the very last byte ends the last line instead of
+         * starting a new one, hence end - 1 */
+        for (p = tf->text; p < end - 1; p++)
+            if (*p == '\n')
+                nb_lines++;
+    }
+    tf->nb_lines = 0;
+    tf->lines = av_calloc(nb_lines + 1, sizeof(*tf->lines));
+    if (!tf->lines)
+        return AVERROR(ENOMEM);
+    /* empty text: lines[0] must remain the NULL terminator */
+    if (!nb_lines)
+        return 0;
+
+    p = tf->text;
+    tf->lines[0] = (char *)p;
+    for (i = 1; i < nb_lines; i++) {
+        p = memchr(p, '\n', end - p);
+        av_assert1(p);
+        *p = 0;
+        tf->lines[i] = (char *)++p;
+    }
+    /* p is now the start of the last line; the only LF it can still
+     * contain is the final byte of the text */
+    if ((p = memchr(p, '\n', end - p))) {
+        av_assert1(p == end - 1);
+        *p = 0;
+    } else {
+        tf->text_flags |= AV_TEXT_FLAG_NO_EOL;
+    }
+    tf->nb_lines = nb_lines;
+    return 0;
+}
+
+/**
+ * Turn the raw data in tf->full_data into the final text.
+ *
+ * Steps: recode to UTF-8 (guess_encoding), make tf->text point at the
+ * recoded data, skip a leading UTF-8 byte order mark (recording
+ * AV_TEXT_FLAG_HAS_BOM in tf->text_flags), then apply the optional
+ * AV_TEXT_FLAG_REMOVE_CR and AV_TEXT_FLAG_SPLIT_LINES processing requested
+ * in tf->flags.
+ *
+ * @param tf  text file with the raw input in full_data / full_data_size
+ * @return  0 on success, a negative error code otherwise
+ */
+static int text_file_process(AVTextFile *tf)
+{
+    int ret;
+
+    tf->text_flags = 0;
+    if ((ret = guess_encoding(tf)) < 0)
+        return ret;
+    tf->text      = tf->full_data;
+    tf->text_size = tf->full_data_size;
+
+    /* only look for the mark when the text is long enough to hold it, so
+     * that memcmp never reads past the end of a short text */
+    if (tf->text_size >= byte_order_marks[0].len &&
+        !memcmp(tf->text, byte_order_marks[0].bom, byte_order_marks[0].len)) {
+        tf->text_size  -= byte_order_marks[0].len;
+        tf->text       += byte_order_marks[0].len;
+        tf->text_flags |= AV_TEXT_FLAG_HAS_BOM;
+    }
+
+    if ((tf->flags & AV_TEXT_FLAG_REMOVE_CR))
+        remove_cr(tf);
+    if ((tf->flags & AV_TEXT_FLAG_SPLIT_LINES))
+        if ((ret = split_lines(tf)) < 0)
+            return ret;
+
+    return 0;
+}
+
+/**
+ * Read everything the callback delivers into tf->full_data, then process
+ * it with text_file_process().
+ *
+ * The callback is called again and again until it returns a negative
+ * value; AVERROR_EOF is the normal end of input, any other negative value
+ * aborts the read. A return of 0 adds nothing and simply leads to another
+ * call, so the callback must report the end of input with AVERROR_EOF.
+ *
+ * @param tf        text file to fill
+ * @param callback  function providing the data
+ * @param opaque    value passed unchanged to the callback
+ * @return  0 on success; AVERROR(ENOMEM) if the buffer cannot grow,
+ *          AVERROR(EINVAL) if the callback claims to have read more than
+ *          it was offered, or the error returned by the callback or by
+ *          the processing step
+ */
+static int text_file_try_read(AVTextFile *tf,
+                              AVTextFileRead callback, void *opaque)
+{
+    AVBPrint bp;
+    unsigned buf_size;
+    uint8_t *buf;
+    int ret, max_read;
+
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+    while (1) {
+        av_bprint_get_buffer(&bp, 512, &buf, &buf_size);
+        if (buf_size <= 1) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        /* one byte of the room is never offered to the callback */
+        max_read = FFMIN(buf_size - 1, INT_MAX);
+        ret = callback(opaque, buf, max_read);
+        if (ret < 0) {
+            if (ret == AVERROR_EOF)
+                ret = 0;
+            break;
+        }
+        if (ret > max_read) {
+            /* trusting this value would move the length past the buffer */
+            ret = AVERROR(EINVAL);
+            break;
+        }
+        bp.len += ret;
+    }
+
+    if (ret < 0) {
+        av_bprint_finalize(&bp, NULL);
+        return ret;
+    }
+    if ((ret = av_bprint_finalize(&bp, (char **)&tf->full_data)) < 0)
+        return ret;
+    tf->full_data_size = bp.len;
+    return text_file_process(tf);
+}
+
+/**
+ * Read and process a text file, taking care of error reporting.
+ * tf->error is cleared first. On failure, a generic message derived from
+ * the error code is stored there unless a more specific one was already
+ * set, and everything allocated so far is released.
+ *
+ * @return  the result of text_file_try_read()
+ */
+static int text_file_read(AVTextFile *tf,
+                          AVTextFileRead callback, void *opaque)
+{
+    int err;
+
+    tf->error[0] = 0;
+    err = text_file_try_read(tf, callback, opaque);
+    if (err >= 0)
+        return err;
+
+    if (tf->error[0] == 0)
+        av_strerror(err, tf->error, sizeof(tf->error));
+    av_text_file_free(tf);
+    return err;
+}
+
+/**
+ * Release the buffers owned by tf and reset the fields describing them.
+ * tf->text points into full_data and is therefore only cleared, not
+ * freed. nb_lines is reset together with lines so that the count never
+ * outlives the array it describes. The structure itself is not freed.
+ *
+ * NOTE(review): this accesses the fields directly and does not look at
+ * tf->struct_size - confirm this is intended for callers built against a
+ * smaller structure.
+ */
+void av_text_file_free(AVTextFile *tf)
+{
+    tf->text = NULL;
+    av_freep(&tf->lines);
+    tf->nb_lines = 0;
+    av_freep(&tf->full_data);
+    tf->text_size = tf->full_data_size = 0;
+}
+
+/**
+ * Public entry point: read a text file through a callback.
+ * The work is done on a full-size local copy of the caller's structure,
+ * which is copied back afterwards, limited to the size the caller
+ * announced in struct_size.
+ */
+int av_text_file_read_callback(AVTextFile *tf_user,
+                               AVTextFileRead callback, void *opaque)
+{
+    /* fields beyond the caller's struct_size keep their zero value */
+    AVTextFile tf = { 0 };
+    int status;
+
+    COPY_FROM_USER(tf);
+    status = text_file_read(&tf, callback, opaque);
+    /* copied back on failure too: that is how the caller gets tf.error */
+    COPY_TO_USER(tf);
+
+    return status;
+}
diff --git a/libavutil/text_file.h b/libavutil/text_file.h
new file mode 100644
index 0000000..d1bfbd3
--- /dev/null
+++ b/libavutil/text_file.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2013 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_TEXT_FILE_H
+#define AVUTIL_TEXT_FILE_H
+
+#include "common.h"
+
+/**
+ * Structure to help read text files.
+ * This API makes it possible to read text files (or other ways of storing
+ * text) while handling the subtleties of character encodings, end-of-line
+ * separators, etc.
+ *
+ * The text returned by this API is always recoded to UTF-8.
+ *
+ * The typical way of using this API is to declare an AVTextFile variable
+ * with the default initialization macro:
+ * AVTextFile tf = { AV_TEXT_FILE_DEFAULT };
+ * Then set fields to control various parts of the process and use it with
+ * the API functions.
+ */
+typedef struct AVTextFile {
+
+    /**
+     * Size of the structure; must be set to sizeof(AVTextFile) to ensure
+     * compatibility with later versions of the library.
+     * The library only reads and writes this many bytes of the structure.
+     */
+
+    size_t struct_size;
+
+    /**
+     * Text read from the file. Always terminated by an additional 0.
+     * Points into full_data; it must not be freed separately.
+     */
+    uint8_t *text;
+
+    /**
+     * Size of text, in bytes, not counting the additional terminating 0.
+     */
+    size_t text_size;
+
+    /**
+     * Full data buffer containing the text; must be freed with av_free()
+     * when no longer needed. Can be different from text due to details such
+     * as byte-order-marks.
+     */
+    uint8_t *full_data;
+
+    /**
+     * Size of full_data, in bytes, not counting the additional 0.
+     */
+    size_t full_data_size;
+
+    /**
+     * Detected encoding; will point to either a static string or an element
+     * of the encodings field.
+     */
+    const char *encoding;
+
+    /**
+     * List of encodings for autodetection, terminated by NULL.
+     * The first encoding in this list that can apply to the file is used.
+     * If NULL, the encoding is taken from the byte order mark when there
+     * is one, and from a built-in list otherwise.
+     */
+    const char *const *encodings;
+
+    /**
+     * Lines of the file; only relevant if AV_TEXT_FLAG_SPLIT_LINES is set.
+     * Terminated by an additional NULL pointer.
+     * The array itself is allocated; its elements point into text.
+     */
+    char **lines;
+
+    /**
+     * Number of elements in the lines array, not counting the additional
+     * NULL.
+     */
+    size_t nb_lines;
+
+    /**
+     * Flags to control the processing of the file. See the processing
+     * flags (AV_TEXT_FLAG_SPLIT_LINES, AV_TEXT_FLAG_REMOVE_CR) below.
+     */
+    unsigned flags;
+
+    /**
+     * Flags describing features of the file hidden by the conversion. See
+     * the result flags (AV_TEXT_FLAG_HAS_BOM, AV_TEXT_FLAG_NO_EOL) below.
+     * Reset each time a file is processed.
+     */
+    unsigned text_flags;
+
+    /**
+     * Error message. If something fails, this field will contain a
+     * human-readable, 0-terminated error message.
+     */
+    char error[128];
+
+} AVTextFile;
+
+/**
+ * Processing flags.
+ * Request bits for the AVTextFile.flags field. They share the
+ * AV_TEXT_FLAG_ prefix and numeric values with the result flags, but the
+ * two sets belong to different fields.
+ */
+enum {
+
+    /**
+     * Split the file into individual lines.
+     * Each newline character is replaced by 0.
+     */
+    AV_TEXT_FLAG_SPLIT_LINES           = 1 << 0,
+
+    /**
+     * Remove each CR (\r) that comes right before an LF (\n).
+     * In other words, convert DOS-style line breaks to Unix-style.
+     */
+    AV_TEXT_FLAG_REMOVE_CR             = 1 << 1,
+};
+
+/**
+ * Result flags.
+ * Information bits reported in the AVTextFile.text_flags field. They
+ * share the AV_TEXT_FLAG_ prefix and numeric values with the processing
+ * flags, but the two sets belong to different fields.
+ */
+enum {
+
+    /**
+     * The file had a byte order mark.
+     * The first character of the file was U+FEFF ZERO WIDTH NO-BREAK SPACE.
+     */
+    AV_TEXT_FLAG_HAS_BOM               = 1 << 0,
+
+    /**
+     * The last line of the file did not end with an LF (\n).
+     * Only relevant if lines were split.
+     */
+    AV_TEXT_FLAG_NO_EOL                = 1 << 1,
+};
+
+/**
+ * Callback to read from a file.
+ * The reader keeps calling it until it returns a negative value.
+ * @param opaque    opaque value passed from the caller
+ * @param buf       buffer to fill with the file data
+ * @param buf_size  size of the buffer, in bytes
+ * @return  the number of bytes read, AVERROR_EOF once the end of the data
+ *          has been reached, or another negative error code
+ */
+typedef int (*AVTextFileRead)(void *opaque, unsigned char *buf, int buf_size);
+
+/**
+ * Read a text file from a callback.
+ * @param tf        structure to fill; struct_size must be set, and the
+ *                  flags and encodings fields control the processing
+ * @param callback  function providing the data
+ * @param opaque    value passed unchanged to the callback
+ * @return  0 on success, a negative error code on failure; on failure
+ *          tf->error holds a message and the buffers are already freed
+ */
+int av_text_file_read_callback(AVTextFile *tf,
+                               AVTextFileRead callback, void *opaque);
+
+/**
+ * Read a text file from the local file system (using stdio).
+ * @param tf        structure to fill
+ * @param filename  path of the file to read
+ * NOTE(review): no definition of this function is visible alongside this
+ * declaration - confirm that one is provided.
+ */
+int av_text_file_read_file(AVTextFile *tf, const char *filename);
+
+/**
+ * Free all memory allocated while reading the file.
+ * The corresponding fields are set to NULL and the sizes of text and
+ * full_data to 0. The structure itself is not freed.
+ */
+void av_text_file_free(AVTextFile *tf);
+
+/**
+ * Default initializer for the first field (struct_size) of an AVTextFile;
+ * meant to be used as: AVTextFile tf = { AV_TEXT_FILE_DEFAULT };
+ */
+#define AV_TEXT_FILE_DEFAULT (sizeof(AVTextFile))
+
+#endif /* AVUTIL_TEXT_FILE_H */
-- 
1.7.10.4



More information about the ffmpeg-devel mailing list