[FFmpeg-devel] [RFC] Generic psychoacoustic model interface

Wed Aug 27 10:35:20 CEST 2008

Here's my first attempt at defining a codec-agnostic psy model;
the proposed interface is attached. I'm not sure about AC3, but
it should be possible to use it with DCA, Vorbis,
MPEG Audio Layers I-III and NBC, maybe WMA too
(provided somebody codes an implementation, of course).
Personally, I plan to make my encoder use it, backed by the
already implemented 3GPP model.
-------------- next part --------------
/*
 * audio encoder psychoacoustic model
 * Copyright (C) 2008 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef FFMPEG_AACPSY_H
#define FFMPEG_AACPSY_H

#include "avcodec.h"

/**
 * maximum possible number of bands
 *
 * Used as the fixed size of the FFPsyContext::bands array.
 */
#define MAX_BANDS 128

/**
 * psychoacoustic information for a single band
 */
typedef struct FFPsyBand {
    /** energy of the band */
    float energy;
    /** threshold of the band (presumably the masking threshold - TODO confirm) */
    float threshold;
    /** perceptual entropy of the band */
    float perceptual_entropy;
} FFPsyBand;

/**
 * information describing the windowing of a frame
 */
typedef struct FFWindowInfo {
    /** window type (short/long/transitional, etc.) - current and previous */
    int window_type[2];
    /** window shape (sine/KBD/whatever) */
    int window_shape;
    /** codec-dependent window information */
    void *additional_info;
} FFWindowInfo;

/**
 * psychoacoustic model context
 */
typedef struct FFPsyContext {
    /** encoder context */
    AVCodecContext *avctx;

    /** frame bands information */
    FFPsyBand bands[MAX_BANDS];
    /** frame window info */
    FFWindowInfo *win_info;

    /** scalefactor band sizes for long frame */
    const uint8_t *long_bands;
    /** number of scalefactor bands for long frame */
    int num_long_bands;
    /** scalefactor band sizes for short frame */
    const uint8_t *short_bands;
    /** number of scalefactor bands for short frame */
    int num_short_bands;

    /** psychoacoustic model implementation private data */
    void *model_priv_data;
} FFPsyContext;

/**
 * Set up a psychoacoustic model context.
 *
 * @param ctx             model context
 * @param avctx           codec context
 * @param long_bands      scalefactor band lengths for long frame
 * @param num_long_bands  number of scalefactor bands for long frame
 * @param short_bands     scalefactor band lengths for short frame
 * @param num_short_bands number of scalefactor bands for short frame
 *
 * @return zero on success, a negative value on failure
 */
int ff_psy_init(FFPsyContext *ctx,
                AVCodecContext *avctx,
                const uint8_t *long_bands,
                int num_long_bands,
                const uint8_t *short_bands,
                int num_short_bands);

/**
 * Suggest window sequence for channel.
 *
 * @param ctx       model context
 * @param audio     samples for the current frame
 * @param la        lookahead samples (NULL when unavailable)
 * @param channel   number of channel element to analyze
 * @param prev_type previous window type
 *
 * @return suggested window information in a structure
 */
FFWindowInfo* ff_psy_suggest_window(AACPsyContext *ctx, int16_t *audio, int16_t *la,
                                    int channel, int prev_type);

/**
 * Perform psychoacoustic analysis and set band info.
 *
 * @param ctx   model context
 * @param tag   number of channel element to analyze
 * @param type  channel element type (e.g. ID_SCE or ID_CPE)
 * @param cpe   pointer to the current channel element
 */
void ff_psy_analyze(AACPsyContext *ctx, int tag, int type, ChannelElement *cpe);

/**
 * Cleanup model context at the end.
 *
 * @param ctx model context
 */
void ff_psy_end(AACPsyContext *ctx);

/**************************************************************************
 *                       Audio preprocessing stuff.                       *
 *       This should be moved into some audio filter eventually.          *
 **************************************************************************/
/** preprocessing context; this header only forward-declares it, so it is handled through pointers here */
struct FFPsyPreprocessContext;

/**
 * Initialize psychoacoustic model audio preprocessing.
 *
 * @param avctx codec context
 *
 * @return preprocessing context
 *         (NOTE(review): the value returned on failure is not specified
 *         here; presumably NULL - confirm)
 */
struct FFPsyPreprocessContext *ff_psy_preprocess_init(AVCodecContext *avctx);

/**
 * Preprocess several channels of an audio frame in order to compress it better.
 *
 * NOTE(review): this function keeps an "ff_aac_" prefix while the related
 * init/end functions are named ff_psy_preprocess_*; consider unifying.
 *
 * @param ctx      preprocessing context
 * @param audio    samples to preprocess
 * @param dest     place to put filtered samples
 * @param tag      number of channel group
 * @param channels number of channels to preprocess (some additional work may be done on stereo pair)
 */
void ff_aac_psy_preprocess(struct FFPsyPreprocessContext *ctx,
                           int16_t *audio, int16_t *dest,
                           int tag, int channels);

/**
 * Clean up the audio preprocessing module.
 *
 * @param ctx preprocessing context
 */
void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx);

#endif /* FFMPEG_AACPSY_H */