[FFmpeg-devel] [PATCH] avfilter: port scaletempo filter from mpv

Sat Dec 2 21:13:42 EET 2017

Signed-off-by: Paul B Mahol <onemda at gmail.com>
---
 libavfilter/Makefile        |   1 +
 libavfilter/af_scaletempo.c | 529 ++++++++++++++++++++++++++++++++++++++++++++
 libavfilter/allfilters.c    |   1 +
 3 files changed, 531 insertions(+)
 create mode 100644 libavfilter/af_scaletempo.c

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 1c0cc1da80..4c025c8d07 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -107,6 +107,7 @@ OBJS-$(CONFIG_PAN_FILTER)                    += af_pan.o
 OBJS-$(CONFIG_REPLAYGAIN_FILTER)             += af_replaygain.o
 OBJS-$(CONFIG_RESAMPLE_FILTER)               += af_resample.o
 OBJS-$(CONFIG_RUBBERBAND_FILTER)             += af_rubberband.o
+OBJS-$(CONFIG_SCALETEMPO_FILTER)             += af_scaletempo.o
 OBJS-$(CONFIG_SIDECHAINCOMPRESS_FILTER)      += af_sidechaincompress.o
 OBJS-$(CONFIG_SIDECHAINGATE_FILTER)          += af_agate.o
 OBJS-$(CONFIG_SILENCEDETECT_FILTER)          += af_silencedetect.o
diff --git a/libavfilter/af_scaletempo.c b/libavfilter/af_scaletempo.c
new file mode 100644
index 0000000000..1e673d3e34
--- /dev/null
+++ b/libavfilter/af_scaletempo.c
@@ -0,0 +1,529 @@
+/*
+ * scaletempo audio filter
+ *
+ * scale tempo while maintaining pitch
+ * (WSOLA technique with cross correlation)
+ * inspired by SoundTouch library by Olli Parviainen
+ *
+ * basic algorithm
+ *   - produce 'stride' output samples per loop
+ *   - consume stride*scale input samples per loop
+ *
+ * to produce smoother transitions between strides, blend next overlap
+ * samples from last stride with correlated samples of current input
+ *
+ * Copyright (c) 2007 Robert Juliano
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "libavutil/channel_layout.h"
+#include "libavutil/opt.h"
+#include "libavutil/samplefmt.h"
+
+#include "avfilter.h"
+#include "audio.h"
+#include "internal.h"
+
+typedef struct ScaleTempoContext
+{
+    AVClass *class;             // presumably filled in by the framework from .priv_class
+
+    // stride
+    float scale;                // effective factor: speed factor * scale_nominal
+    float speed;                // last value given to update_speed()
+    int frames_stride;          // frames written per stride
+    float frames_stride_scaled; // scale * frames_stride: input frames advanced per stride
+    float frames_stride_error;  // fractional part of the advance, carried to the next stride
+    int bytes_per_frame;        // bytes per sample * channels
+    int bytes_stride;           // frames_stride in bytes
+    int bytes_queue;            // fill level buf_queue must reach before a stride is produced
+    int bytes_queued;           // bytes currently held in buf_queue
+    int bytes_to_slide;         // bytes still to be dropped from the queue / incoming data
+    int8_t *buf_queue;          // input queue, bytes_queue + UNROLL_PADDING bytes
+    // overlap
+    int samples_overlap;        // overlapped frames * channels
+    int samples_standing;       // bytes_standing / bytes per sample
+    int bytes_overlap;          // blended head of each stride, in bytes (0 = no blending)
+    int bytes_standing;         // rest of the stride, copied unmodified
+    void *buf_overlap;          // input following the previous stride, blended into the next
+    void *table_blend;          // per-sample blend weights (int32_t for s16, float otherwise)
+    void (*output_overlap)(struct ScaleTempoContext *s, void *out_buf,
+                           int bytes_off); // blend routine, NULL without overlap
+    // best overlap
+    int frames_search;          // number of candidate frame offsets examined
+    int num_channels;
+    void *buf_pre_corr;         // scratch: table_window * buf_overlap
+    void *table_window;         // i * (frames_overlap - i) shaped search window
+    int (*best_overlap_offset)(struct ScaleTempoContext *s); // NULL when nothing is searched
+    // command line
+    float scale_nominal;        // "scale" option
+    float ms_stride;            // "stride" option, milliseconds
+    float percent_overlap;      // "overlap" option, a 0..1 fraction of the stride
+    float ms_search;            // "search" option, milliseconds
+#define SCALE_TEMPO 1
+#define SCALE_PITCH 2           // tested by update_speed(): use 1 / speed
+    int speed_opt;              // "speed" option, bit mask of the SCALE_* values
+
+    int64_t pts;                // timestamp of the next output frame, AV_NOPTS_VALUE until known
+} ScaleTempoContext;
+
+/**
+ * Advertise what the filter accepts: every channel count, every sample
+ * rate, and the S16 / FLT sample formats.
+ * @return a negative AVERROR code on failure, else the last setter's result
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVSampleFormat sample_fmts[] = {
+        AV_SAMPLE_FMT_S16,
+        AV_SAMPLE_FMT_FLT,
+        AV_SAMPLE_FMT_NONE
+    };
+    AVFilterChannelLayouts *channel_counts;
+    AVFilterFormats *list;
+    int err;
+
+    channel_counts = ff_all_channel_counts();
+    if (!channel_counts)
+        return AVERROR(ENOMEM);
+    if ((err = ff_set_common_channel_layouts(ctx, channel_counts)) < 0)
+        return err;
+
+    list = ff_make_format_list(sample_fmts);
+    if (!list)
+        return AVERROR(ENOMEM);
+    if ((err = ff_set_common_formats(ctx, list)) < 0)
+        return err;
+
+    list = ff_all_samplerates();
+    if (!list)
+        return AVERROR(ENOMEM);
+    return ff_set_common_samplerates(ctx, list);
+}
+
+/**
+ * Apply any pending slide, then top up the queue from the input frame.
+ * @param offset byte position in in->data[0] where unread data starts
+ * @return number of input bytes consumed (skipped plus copied)
+ */
+static int fill_queue(AVFilterContext *ctx, AVFrame *in, int offset)
+{
+    ScaleTempoContext *s = ctx->priv;
+    int avail = in->nb_samples * s->bytes_per_frame - offset;
+    int pos = offset;
+
+    if (s->bytes_to_slide > 0 && s->bytes_to_slide < s->bytes_queued) {
+        // the slide ends inside the queue: move the kept tail to the front
+        int keep = s->bytes_queued - s->bytes_to_slide;
+
+        memmove(s->buf_queue, s->buf_queue + s->bytes_to_slide, keep);
+        s->bytes_queued   = keep;
+        s->bytes_to_slide = 0;
+    } else if (s->bytes_to_slide > 0) {
+        // the slide swallows the whole queue and maybe part of the input
+        int drop = FFMIN(s->bytes_to_slide - s->bytes_queued, avail);
+        s->bytes_to_slide -= s->bytes_queued + drop;
+        s->bytes_queued    = 0;
+        pos               += drop;
+        avail             -= drop;
+    }
+
+    if (avail > 0) {
+        int n = FFMIN(s->bytes_queue - s->bytes_queued, avail);
+        memcpy(s->buf_queue + s->bytes_queued, in->data[0] + pos, n);
+        s->bytes_queued += n;
+        pos             += n;
+    }
+    return pos - offset;
+}
+
+#define UNROLL_PADDING (4 * 4) // extra bytes allocated past the end of buf_queue and buf_pre_corr
+
+/**
+ * Search the queue for the frame offset whose samples correlate best with
+ * the windowed overlap buffer (float samples).
+ * @return byte offset into buf_queue of the best match
+ */
+static int best_overlap_offset_float(ScaleTempoContext *s)
+{
+    const int nch = s->num_channels;
+    const int len = s->samples_overlap - nch; // first frame is left out
+    const float *window = s->table_window;
+    const float *prev   = (const float *)s->buf_overlap + nch;
+    const float *cand   = (const float *)s->buf_queue + nch;
+    float *pre = s->buf_pre_corr;
+    float top = INT_MIN;
+    int k, frame, winner = 0;
+
+    for (k = 0; k < len; k++)
+        pre[k] = window[k] * prev[k];
+
+    for (frame = 0; frame < s->frames_search; frame++, cand += nch) {
+        float sum = 0;
+        for (k = 0; k < len; k++)
+            sum += pre[k] * cand[k];
+        if (sum > top) {
+            top    = sum;
+            winner = frame;
+        }
+    }
+    return winner * 4 * nch;
+}
+
+/**
+ * Search the queue for the frame offset whose samples correlate best with
+ * the windowed overlap buffer (signed 16-bit samples).
+ *
+ * The correlation covers exactly the samples_overlap - num_channels entries
+ * written to buf_pre_corr and does not depend on anything stored past them,
+ * neither in buf_pre_corr nor behind the searched part of buf_queue. Products
+ * are widened to 64 bits before they are accumulated.
+ *
+ * @param s filter state; buf_overlap, table_window and buf_queue are read,
+ *          buf_pre_corr is overwritten
+ * @return byte offset into buf_queue of the best match
+ */
+static int best_overlap_offset_s16(ScaleTempoContext *s)
+{
+    const int nch = s->num_channels;
+    const int len = s->samples_overlap - nch; // first frame is left out
+    const int32_t *pw = s->table_window;
+    const int16_t *po = (const int16_t *)s->buf_overlap + nch;
+    const int16_t *search_start = (const int16_t *)s->buf_queue + nch;
+    int32_t *ppc = s->buf_pre_corr;
+    int64_t best_corr = INT64_MIN;
+    int i, off, best_off = 0;
+
+    for (i = 0; i < len; i++)
+        ppc[i] = (pw[i] * po[i]) >> 15;
+
+    for (off = 0; off < s->frames_search; off++, search_start += nch) {
+        int64_t corr = 0;
+
+        for (i = 0; i < len; i++)
+            corr += (int64_t)ppc[i] * search_start[i];
+        if (corr > best_corr) {
+            best_corr = corr;
+            best_off  = off;
+        }
+    }
+
+    return best_off * 2 * nch;
+}
+
+/* Cross-fade the start of a stride: out = prev - weight * (prev - cur),
+ * with prev = buf_overlap, cur = queue data at bytes_off (float samples). */
+static void output_overlap_float(ScaleTempoContext *s, void *buf_out,
+                                 int bytes_off)
+{
+    const float *weight = s->table_blend;
+    const float *prev   = s->buf_overlap;
+    const float *cur    = (const float *)(s->buf_queue + bytes_off);
+    float *dst = buf_out;
+    int k;
+
+    for (k = 0; k < s->samples_overlap; k++)
+        dst[k] = prev[k] - weight[k] * (prev[k] - cur[k]);
+}
+
+/* Cross-fade the start of a stride (s16): out = prev - ((w * (prev - cur)) >> 16).
+ * The product is 64-bit: w nears 2^16 and |prev - cur| can reach 65535. */
+static void output_overlap_s16(ScaleTempoContext *s, void *buf_out,
+                               int bytes_off)
+{
+    const int32_t *pb  = s->table_blend;
+    const int16_t *po  = s->buf_overlap;
+    const int16_t *pin = (const int16_t *)(s->buf_queue + bytes_off);
+    int16_t *pout = buf_out;
+    int i;
+
+    for (i = 0; i < s->samples_overlap; i++)
+        pout[i] = po[i] - (int)(((int64_t)pb[i] * (po[i] - pin[i])) >> 16);
+}
+
+/**
+ * Consume one input frame and emit every stride that can be completed.
+ * Owns in: it is forwarded untouched when scale == 1.0, otherwise freed.
+ * Input that does not yet fill the queue stays buffered for the next call.
+ * @return 0 or ff_filter_frame()'s result; AVERROR(ENOMEM) on alloc failure
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx  = inlink->dst;
+    ScaleTempoContext *s  = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    int consumed, max_strides;
+    uint8_t *dst;
+    AVFrame *out;
+
+    if (s->scale == 1.0)
+        return ff_filter_frame(outlink, in);
+
+    if (s->pts == AV_NOPTS_VALUE)
+        s->pts = in->pts;
+
+    max_strides = (int)(in->nb_samples / s->frames_stride_scaled) + 1;
+    out = ff_get_audio_buffer(outlink, max_strides * s->frames_stride);
+    if (!out) {
+        av_frame_free(&in);
+        return AVERROR(ENOMEM);
+    }
+    av_frame_copy_props(out, in);
+    dst = out->data[0];
+
+    for (consumed = fill_queue(ctx, in, 0);
+         s->bytes_queued >= s->bytes_queue;
+         consumed += fill_queue(ctx, in, consumed)) {
+        int bytes_off = 0, whole;
+        float advance;
+
+        // head of the stride: blended, taken at the best matching offset
+        if (s->output_overlap) {
+            if (s->best_overlap_offset)
+                bytes_off = s->best_overlap_offset(s);
+            s->output_overlap(s, dst, bytes_off);
+        }
+        // rest of the stride: copied straight from the queue
+        memcpy(dst + s->bytes_overlap,
+               s->buf_queue + bytes_off + s->bytes_overlap,
+               s->bytes_standing);
+        dst += s->bytes_stride;
+        // keep what follows the stride, then schedule the scaled advance
+        memcpy(s->buf_overlap,
+               s->buf_queue + bytes_off + s->bytes_stride,
+               s->bytes_overlap);
+        advance = s->frames_stride_scaled + s->frames_stride_error;
+        whole   = (int)advance;
+        s->frames_stride_error = advance - whole;
+        s->bytes_to_slide      = whole * s->bytes_per_frame;
+    }
+    av_frame_free(&in);
+    out->nb_samples = (dst - (uint8_t *)out->data[0]) / s->bytes_per_frame;
+    if (!out->nb_samples) {
+        av_frame_free(&out);
+        return 0;
+    }
+    out->pts = s->pts;
+    s->pts  += av_rescale_q(out->nb_samples, (AVRational){1, outlink->sample_rate},
+                            outlink->time_base);
+    return ff_filter_frame(outlink, out);
+}
+
+/* Recompute scale and the per-stride input advance for a new speed.
+ * With SCALE_PITCH set in speed_opt the speed is applied inverted. */
+static void update_speed(AVFilterContext *ctx, float speed)
+{
+    ScaleTempoContext *s = ctx->priv;
+    const int invert = s->speed_opt & SCALE_PITCH;
+    const double factor = invert ? 1.0 / speed : speed;
+
+    s->speed                = speed;
+    s->scale                = factor * s->scale_nominal;
+    s->frames_stride_scaled = s->scale * s->frames_stride;
+    s->frames_stride_error  = FFMIN(s->frames_stride_error, s->frames_stride_scaled);
+}
+
+/**
+ * Derive the stride, overlap and search sizes from the negotiated input
+ * format and (re)allocate the working buffers.
+ *
+ * A stride shorter than one frame is rejected. In the s16 case the
+ * pre-correlation buffer is zeroed from the end of its computed entries
+ * onwards, so reads past those entries never see uninitialised memory.
+ *
+ * @return 0, AVERROR(EINVAL) (stride too short) or AVERROR(ENOMEM)
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    ScaleTempoContext *s = ctx->priv;
+    const float srate    = inlink->sample_rate / 1000.0; // frames per ms
+    const int nch        = inlink->channels;
+    const int use_int    = inlink->format == AV_SAMPLE_FMT_S16;
+    const int bps        = use_int ? 2 : 4;              // bytes per sample
+    int frames_overlap, i, j;
+
+    s->frames_stride = srate * s->ms_stride;
+    // with an empty stride the queue size can be 0 and filter_frame() never ends
+    if (s->frames_stride < 1)
+        return AVERROR(EINVAL);
+    s->bytes_stride  = s->frames_stride * bps * nch;
+    s->speed = 1.f;
+    s->pts = AV_NOPTS_VALUE;
+
+    update_speed(ctx, s->speed);
+
+    frames_overlap = s->frames_stride * s->percent_overlap;
+    if (frames_overlap <= 0) {
+        s->bytes_standing   = s->bytes_stride;
+        s->samples_standing = s->bytes_standing / bps;
+        s->output_overlap   = NULL;
+        s->bytes_overlap    = 0;
+    } else {
+        s->samples_overlap  = frames_overlap * nch;
+        s->bytes_overlap    = frames_overlap * nch * bps;
+        s->bytes_standing   = s->bytes_stride - s->bytes_overlap;
+        s->samples_standing = s->bytes_standing / bps;
+        s->buf_overlap      = av_realloc(s->buf_overlap, s->bytes_overlap);
+        s->table_blend      = av_realloc(s->table_blend, s->bytes_overlap * 4);
+        if (!s->buf_overlap || !s->table_blend)
+            return AVERROR(ENOMEM);
+        memset(s->buf_overlap, 0, s->bytes_overlap);
+        if (use_int) {
+            // weights are i / frames_overlap scaled by 2^16
+            int32_t *pb = s->table_blend;
+            int64_t blend = 0;
+
+            for (i = 0; i < frames_overlap; i++) {
+                int32_t v = blend / frames_overlap;
+                for (j = 0; j < nch; j++)
+                    *pb++ = v;
+                blend += 65536; // 2^16
+            }
+            s->output_overlap = output_overlap_s16;
+        } else {
+            float *pb = s->table_blend;
+
+            for (i = 0; i < frames_overlap; i++) {
+                float v = i / (float)frames_overlap;
+                for (j = 0; j < nch; j++)
+                    *pb++ = v;
+            }
+            s->output_overlap = output_overlap_float;
+        }
+    }
+
+    s->frames_search = (frames_overlap > 1) ? srate * s->ms_search : 0;
+    if (s->frames_search <= 0) {
+        s->best_overlap_offset = NULL;
+    } else if (use_int) {
+        // bytes taken by the (frames_overlap - 1) * nch int32_t entries
+        const int bytes_corr = s->bytes_overlap * 2 - nch * bps * 2;
+        int64_t t = frames_overlap;
+        int32_t n = 8589934588LL / (t * t); // 4 * (2^31 - 1) / t^2
+        int32_t *pw;
+
+        s->buf_pre_corr = av_realloc(s->buf_pre_corr,
+                                     s->bytes_overlap * 2 + UNROLL_PADDING);
+        s->table_window = av_realloc(s->table_window, bytes_corr);
+        if (!s->buf_pre_corr || !s->table_window)
+            return AVERROR(ENOMEM);
+        // zero everything behind the computed entries, not just the padding,
+        // so a read past them sees zeros instead of uninitialised heap
+        memset((char *)s->buf_pre_corr + bytes_corr, 0,
+               s->bytes_overlap * 2 + UNROLL_PADDING - bytes_corr);
+        pw = s->table_window;
+        for (i = 1; i < frames_overlap; i++) {
+            int32_t v = (i * (t - i) * n) >> 15;
+            for (j = 0; j < nch; j++)
+                *pw++ = v;
+        }
+        s->best_overlap_offset = best_overlap_offset_s16;
+    } else {
+        float *pw;
+
+        s->buf_pre_corr = av_realloc(s->buf_pre_corr, s->bytes_overlap);
+        s->table_window = av_realloc(s->table_window,
+                                     s->bytes_overlap - nch * bps);
+        if (!s->buf_pre_corr || !s->table_window)
+            return AVERROR(ENOMEM);
+        pw = s->table_window;
+        for (i = 1; i < frames_overlap; i++) {
+            float v = i * (frames_overlap - i);
+            for (j = 0; j < nch; j++)
+                *pw++ = v;
+        }
+        s->best_overlap_offset = best_overlap_offset_float;
+    }
+
+    s->bytes_per_frame = bps * nch;
+    s->num_channels    = nch;
+
+    s->bytes_queue = (s->frames_search + s->frames_stride + frames_overlap)
+                     * bps * nch;
+    s->buf_queue = av_realloc(s->buf_queue, s->bytes_queue + UNROLL_PADDING);
+    if (!s->buf_queue)
+        return AVERROR(ENOMEM);
+
+    s->bytes_queued = 0;
+    s->bytes_to_slide = 0;
+
+    return 0;
+}
+
+// Release the five working buffers held by the context.
+static void uninit(AVFilterContext *ctx)
+{
+    ScaleTempoContext *s = ctx->priv;
+    av_freep(&s->table_window);
+    av_freep(&s->table_blend);
+    av_freep(&s->buf_pre_corr);
+    av_freep(&s->buf_overlap);
+    av_freep(&s->buf_queue);
+}
+
+#define OFFSET(x) offsetof(ScaleTempoContext, x) // byte offset of the field backing an option
+#define AF        AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
+/* User options. "overlap" is a 0..1 fraction of the stride; "speed" takes none/tempo/pitch/both. */
+static const AVOption scaletempo_options[] = {
+    { "scale", "set nominal amount to scale tempo", OFFSET(scale_nominal), AV_OPT_TYPE_FLOAT, {.dbl=1.0}, 0.01, 10, AF },
+    { "stride", "set length in ms to output each stride", OFFSET(ms_stride), AV_OPT_TYPE_FLOAT, {.dbl= 60}, 0.01, 1000, AF },
+    { "overlap", "set percentage of stride to overlap", OFFSET(percent_overlap), AV_OPT_TYPE_FLOAT, {.dbl=.2}, 0, 1, AF },
+    { "search", "set length in ms to search for best overlap position", OFFSET(ms_search), AV_OPT_TYPE_FLOAT, {.dbl=14}, 0.01, 1000, AF },
+    { "speed", "set response to tempo change", OFFSET(speed_opt), AV_OPT_TYPE_INT, {.i64=1}, 0, 3, AF, "speed" },
+    { "none",   NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 0, AF, "speed" },
+    { "tempo",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, AF, "speed" },
+    { "pitch",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, AF, "speed" },
+    { "both",   NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, AF, "speed" },
+    { NULL }
+};
+/* Presumably defines scaletempo_class (used as .priv_class of the filter) from the table above. */
+AVFILTER_DEFINE_CLASS(scaletempo);
+
+static const AVFilterPad scaletempo_inputs[] = { // one audio input pad
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_AUDIO,
+        .filter_frame = filter_frame, // queues the frame's data and emits completed strides
+        .config_props = config_input, // computes sizes and allocates the working buffers
+    },
+    { NULL } // end of list
+};
+
+static const AVFilterPad scaletempo_outputs[] = { // one audio output pad, no callbacks
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_AUDIO,
+    },
+    { NULL } // end of list
+};
+
+AVFilter ff_af_scaletempo = { // filter definition tying together the callbacks, options and pads
+    .name            = "scaletempo",
+    .description     = NULL_IF_CONFIG_SMALL("Scale audio tempo while maintaining pitch."),
+    .uninit          = uninit,                    // frees the working buffers
+    .query_formats   = query_formats,             // S16 / FLT, any channel count and rate
+    .priv_size       = sizeof(ScaleTempoContext), // size of the per-instance state
+    .priv_class      = &scaletempo_class,
+    .inputs          = scaletempo_inputs,
+    .outputs         = scaletempo_outputs,
+};
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index fc212e58db..5e622b9ad4 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -118,6 +118,7 @@ static void register_all(void)
     REGISTER_FILTER(REPLAYGAIN,     replaygain,     af);
     REGISTER_FILTER(RESAMPLE,       resample,       af);
     REGISTER_FILTER(RUBBERBAND,     rubberband,     af);
+    REGISTER_FILTER(SCALETEMPO,     scaletempo,     af);
     REGISTER_FILTER(SIDECHAINCOMPRESS, sidechaincompress, af);
     REGISTER_FILTER(SIDECHAINGATE,  sidechaingate,  af);
     REGISTER_FILTER(SILENCEDETECT,  silencedetect,  af);
-- 
2.11.0