[FFmpeg-devel] [PATCH V2 1/2] avcodec/vorbisenc: Add pre-echo detection
Tyler Jones
tdjones879 at gmail.com
Mon Jul 17 18:17:09 EEST 2017
The encoder will attempt to determine the existence of transient
signals by applying a 4th order highpass filter to remove dominant
low frequency waveforms. Frames are then split up into blocks
where the variance is calculated and compared with blocks from
the previous frame. A preecho is only likely to be noticeable when
relatively quiet audio is followed by a loud transient signal.
Signed-off-by: Tyler Jones <tdjones879 at gmail.com>
---
V2 - Properly prefix non-static functions with "ff_"
libavcodec/Makefile | 2 +-
libavcodec/vorbisenc.c | 28 +++++++--
libavcodec/vorbispsy.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++
libavcodec/vorbispsy.h | 79 +++++++++++++++++++++++++
4 files changed, 256 insertions(+), 6 deletions(-)
create mode 100644 libavcodec/vorbispsy.c
create mode 100644 libavcodec/vorbispsy.h
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 59029a853c..8c2beb3315 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -610,7 +610,7 @@ OBJS-$(CONFIG_VMNC_DECODER) += vmnc.o
OBJS-$(CONFIG_VORBIS_DECODER) += vorbisdec.o vorbisdsp.o vorbis.o \
vorbis_data.o
OBJS-$(CONFIG_VORBIS_ENCODER) += vorbisenc.o vorbis.o \
- vorbis_data.o
+ vorbis_data.o vorbispsy.o
OBJS-$(CONFIG_VP3_DECODER) += vp3.o
OBJS-$(CONFIG_VP5_DECODER) += vp5.o vp56.o vp56data.o vp56rac.o
OBJS-$(CONFIG_VP6_DECODER) += vp6.o vp56.o vp56data.o \
diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index bf21a3b1ff..5dc803aabb 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -33,6 +33,7 @@
#include "mathops.h"
#include "vorbis.h"
#include "vorbis_enc_data.h"
+#include "vorbispsy.h"
#include "audio_frame_queue.h"
#include "libavfilter/bufferqueue.h"
@@ -136,6 +137,7 @@ typedef struct vorbis_enc_context {
int64_t next_pts;
AVFloatDSPContext *fdsp;
+ VorbisPsyContext *vpctx;
} vorbis_enc_context;
#define MAX_CHANNELS 2
@@ -272,11 +274,12 @@ static int create_vorbis_context(vorbis_enc_context *venc,
vorbis_enc_floor *fc;
vorbis_enc_residue *rc;
vorbis_enc_mapping *mc;
- int i, book, ret;
+ int i, book, ret, blocks;
venc->channels = avctx->channels;
venc->sample_rate = avctx->sample_rate;
- venc->log2_blocksize[0] = venc->log2_blocksize[1] = 11;
+ venc->log2_blocksize[0] = 8;
+ venc->log2_blocksize[1] = 11;
venc->ncodebooks = FF_ARRAY_ELEMS(cvectors);
venc->codebooks = av_malloc(sizeof(vorbis_enc_codebook) * venc->ncodebooks);
@@ -464,6 +467,12 @@ static int create_vorbis_context(vorbis_enc_context *venc,
if ((ret = dsp_init(avctx, venc)) < 0)
return ret;
+ blocks = 1 << (venc->log2_blocksize[1] - venc->log2_blocksize[0]);
+ venc->vpctx = av_mallocz(sizeof(VorbisPsyContext));
+ if (!venc->vpctx || (ret = ff_psy_vorbis_init(venc->vpctx, venc->sample_rate,
+ venc->channels, blocks)) < 0)
+ return AVERROR(ENOMEM);
+
return 0;
}
@@ -1071,22 +1080,23 @@ static void move_audio(vorbis_enc_context *venc, int sf_size)
float *save = venc->saved + ch * frame_size;
const float *input = (float *) cur->extended_data[ch];
const size_t len = cur->nb_samples * sizeof(float);
-
memcpy(offset + sf*sf_size, input, len);
memcpy(save + sf*sf_size, input, len); // Move samples for next frame
}
av_frame_free(&cur);
}
venc->have_saved = 1;
- memcpy(venc->scratch, venc->samples, 2 * venc->channels * frame_size);
+ memcpy(venc->scratch, venc->samples, sizeof(float) * venc->channels * 2 * frame_size);
}
static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
const AVFrame *frame, int *got_packet_ptr)
{
vorbis_enc_context *venc = avctx->priv_data;
- int i, ret, need_more;
+ int i, ret, need_more, ch;
+ int curr_win = 1;
int frame_size = 1 << (venc->log2_blocksize[1] - 1);
+ int block_size = 1 << (venc->log2_blocksize[0] - 1);
vorbis_enc_mode *mode;
vorbis_enc_mapping *mapping;
PutBitContext pb;
@@ -1121,6 +1131,13 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
move_audio(venc, avctx->frame_size);
+ for (ch = 0; ch < venc->channels; ch++) {
+ float *scratch = venc->scratch + 2 * ch * frame_size + frame_size;
+
+ if (!ff_psy_vorbis_block_frame(venc->vpctx, scratch, ch, frame_size, block_size))
+ curr_win = 0;
+ }
+
if (!apply_window_and_mdct(venc))
return 0;
@@ -1252,6 +1269,7 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx)
ff_mdct_end(&venc->mdct[1]);
ff_af_queue_close(&venc->afq);
ff_bufqueue_discard_all(&venc->bufqueue);
+ ff_psy_vorbis_close(venc->vpctx);
av_freep(&avctx->extradata);
diff --git a/libavcodec/vorbispsy.c b/libavcodec/vorbispsy.c
new file mode 100644
index 0000000000..aec62fb7a0
--- /dev/null
+++ b/libavcodec/vorbispsy.c
@@ -0,0 +1,153 @@
+/*
+ * Vorbis encoder psychoacoustic model
+ * Copyright (C) 2017 Tyler Jones
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <math.h>
+
+#include "avcodec.h"
+#include "libavutil/attributes.h"
+#include "vorbispsy.h"
+
+/**
+ * Generate the coefficients for a highpass biquad filter
+ *
+ * @param filter Instance of biquad filter to be initialized
+ * @param Fs Input's sampling frequency
+ * @param Fc Critical frequency for samples to be passed
+ * @param Q Quality factor
+ */
+static av_cold void biquad_filter_init(IIRFilter *filter, int Fs, int Fc, float Q)
+{
+ float k = tan(M_PI * Fc / Fs);
+ float normalize = 1 / (1 + k / Q + k * k);
+
+ filter->b[0] = normalize;
+ filter->b[1] = -2 * normalize;
+ filter->b[2] = normalize;
+
+ filter->a[0] = 1;
+ filter->a[1] = 2 * (k * k - 1) * normalize;
+ filter->a[2] = (1 - k / Q + k * k) * normalize;
+}
+
+/**
+ * Direct Form II implementation for a second order digital filter
+ *
+ * @param filter Filter to be applied to input samples
+ * @param in Single input sample to be filtered
+ * @param delay Array of IIR feedback values
+ * @return Filtered sample
+ */
+static float apply_filter(IIRFilter *filter, float in, float *delay)
+{
+ float ret, w;
+
+ w = filter->a[0] * in - filter->a[1] * delay[0] - filter->a[2] * delay[1];
+ ret = filter->b[0] * w + filter->b[1] * delay[0] + filter->b[2] * delay[1];
+
+ delay[1] = delay[0];
+ delay[0] = w;
+
+ return ret;
+}
+
+/**
+ * Calculate the variance of a block of samples
+ *
+ * @param in Array of input samples
+ * @param length Number of input samples being analyzed
+ * @return The variance for the current block
+ */
+static float variance(const float *in, int length)
+{
+ int i;
+ float mean = 0.0f, square_sum = 0.0f;
+
+ for (i = 0; i < length; i++) {
+ mean += in[i];
+ square_sum += in[i] * in[i];
+ }
+
+ mean /= length;
+ return (square_sum - length * mean * mean) / (length - 1);
+}
+
+av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate,
+ int channels, int blocks)
+{
+ int crit_freq;
+ float Q[2] = {.54, 1.31}; // Quality values for maximally flat cascaded filters
+
+ vpctx->filter_delay = av_mallocz_array(4 * channels, sizeof(vpctx->filter_delay[0]));
+ if (!vpctx->filter_delay)
+ return AVERROR(ENOMEM);
+
+ crit_freq = sample_rate / 4;
+ biquad_filter_init(&vpctx->filter[0], sample_rate, crit_freq, Q[0]);
+ biquad_filter_init(&vpctx->filter[1], sample_rate, crit_freq, Q[1]);
+
+ vpctx->variance = av_mallocz_array(channels * blocks, sizeof(vpctx->variance[0]));
+ if (!vpctx->variance)
+ return AVERROR(ENOMEM);
+
+ vpctx->preecho_thresh = 100.0f;
+
+ return 0;
+}
+
+int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio,
+ int ch, int frame_size, int block_size)
+{
+ int i, block_flag = 1;
+ int blocks = frame_size / block_size;
+ float last_var;
+ const float eps = 1e-4;
+ float *var = vpctx->variance + ch * blocks;
+
+ for (i = 0; i < frame_size; i++) {
+ apply_filter(&vpctx->filter[0], audio[i], vpctx->filter_delay + 4 * ch);
+ apply_filter(&vpctx->filter[1], audio[i], vpctx->filter_delay + 4 * ch + 2);
+ }
+
+ for (i = 0; i < blocks; i++) {
+ last_var = var[i];
+ var[i] = variance(audio + i * block_size, block_size);
+
+ /* A small constant is added to the threshold in order to prevent false
+ * transients from being detected when quiet sounds follow near-silence */
+ if (var[i] > vpctx->preecho_thresh * last_var + eps)
+ block_flag = 0;
+ }
+
+ return block_flag;
+}
+
+av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx)
+{
+ if (vpctx) {
+ if (vpctx->filter_delay)
+ av_freep(&vpctx->filter_delay);
+
+ if (vpctx->variance)
+ av_freep(&vpctx->variance);
+
+ av_freep(&vpctx);
+ }
+}
diff --git a/libavcodec/vorbispsy.h b/libavcodec/vorbispsy.h
new file mode 100644
index 0000000000..021a5e8a28
--- /dev/null
+++ b/libavcodec/vorbispsy.h
@@ -0,0 +1,79 @@
+/*
+ * Vorbis encoder psychoacoustic model
+ * Copyright (C) 2017 Tyler Jones
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Vorbis psychoacoustic model
+ */
+
+#ifndef AVCODEC_VORBISPSY_H
+#define AVCODEC_VORBISPSY_H
+
+#include "libavutil/attributes.h"
+
+/**
+ * Second order IIR Filter
+ */
+typedef struct IIRFilter {
+ float b[3]; ///< Normalized cofficients for numerator of transfer function
+ float a[3]; ///< Normalized coefficiets for denominator of transfer function
+} IIRFilter;
+
+typedef struct VorbisPsyContext {
+ IIRFilter filter[2];
+ float *filter_delay; ///< Direct Form II delay registers for each channel
+ float *variance; ///< Saved variances from previous sub-blocks for each channel
+ float preecho_thresh; ///< Threshold for determining prescence of a preecho
+} VorbisPsyContext;
+
+/**
+ * Initializes the psychoacoustic model context
+ *
+ * @param vpctx Uninitialized pointer to the model context
+ * @param sample_rate Input audio sample rate
+ * @param channels Number of channels being analyzed
+ * @param blocks Number of short blocks for every frame of input
+ * @return 0 on success, negative on failure
+ */
+av_cold int ff_psy_vorbis_init(VorbisPsyContext *vpctx, int sample_rate,
+ int channels, int blocks);
+
+/**
+ * Suggest the type of block to use for encoding the current frame
+ *
+ * Each frame of input is passed through a highpass filter to remove dominant
+ * low-frequency waveforms and the variance of each short block of input is
+ * then calculated. If the variance over this block is significantly more than
+ * blocks from the previous frame, a transient signal is likely present.
+ *
+ * @param audio Pointer to the current channel's input samples
+ * @param ch Current channel being analyzed
+ * @param frame_size Size of a full frame, i.e. the size of the long block
+ * @param block_size Size of the short block
+ * @return The correct blockflag to use for encoding, 0 short and 1 long
+ */
+int ff_psy_vorbis_block_frame(VorbisPsyContext *vpctx, float *audio,
+ int ch, int frame_size, int block_size);
+/**
+ * Closes and frees the memory used by the psychoacoustic model
+ */
+av_cold void ff_psy_vorbis_close(VorbisPsyContext *vpctx);
+#endif /* AVCODEC_VORBISPSY_H */
--
2.13.3
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20170717/e9eae11e/attachment.sig>
More information about the ffmpeg-devel
mailing list