[FFmpeg-devel] [PATCH] Whisper audio filter

Thu Jul 10 02:37:46 EEST 2025

Hi Vittorio

On Wed, Jul 09, 2025 at 09:23:48AM +0200, Vittorio Palmisano wrote:
> It adds a new audio filter for running audio transcriptions with the whisper model.

I am happy to see someone contribute a whisper filter!

[...]
> +@example

> +ffmpeg -i input.mp4 -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=

Is there a reason why we convert to 16 kHz mono here?

> +model=../whisper.cpp/models/ggml-base.en.bin\

It would be nice if the models were in a standard location, so the user
just has to specify the model name and not the path.

Maybe the filter could check some "standard" locations.
I don't know what path is standard, but maybe something like:
/usr/local/share/whisper.cpp/models
~/.whisper.cpp/models

> +:language=en\
> +:queue=3000\
> +:destination=output.srt\

> +:format=srt" -f null -

The format can be deduced from the destination file extension.

I tried this:

./ffmpeg -i matrixbench_mpeg2.mpg -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=model=/home/michael/whisper.cpp/models/ggml-base.en.bin:language=en:queue=3000:destination=output.srt:format=srt" -f null -

but the output.srt is empty (0 bytes)

[...]

> +static void cb_log_disable(enum ggml_log_level, const char *, void *) {}

libavfilter/af_whisper.c: In function ‘cb_log_disable’:
libavfilter/af_whisper.c:75:28: error: parameter name omitted
   75 | static void cb_log_disable(enum ggml_log_level, const char *, void *) {}

libavfilter/af_whisper.c:75:49: error: parameter name omitted
   75 | static void cb_log_disable(enum ggml_log_level, const char *, void *) {}
      |                                                 ^~~~~~~~~~~~
libavfilter/af_whisper.c:75:63: error: parameter name omitted
   75 | static void cb_log_disable(enum ggml_log_level, const char *, void *) {}

> +
> +static int init(AVFilterContext *ctx)
> +{
> +    WhisperContext *wctx = ctx->priv;
> +
> +    ggml_backend_load_all();
> +    whisper_log_set(cb_log_disable, NULL);
> +
> +    // Init whisper context
> +    if (!wctx->model_path)
> +    {
> +        av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    struct whisper_context_params params = whisper_context_default_params();
> +    params.use_gpu = wctx->use_gpu;
> +    params.gpu_device = wctx->gpu_device;
> +
> +    wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
> +    if (wctx->ctx_wsp == NULL)
> +    {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
> +        return AVERROR(EIO);
> +    }
> +
> +    wctx->whisper_state = whisper_init_state(wctx->ctx_wsp);
> +    if (wctx->whisper_state == NULL)
> +    {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to get whisper state from context\n");
> +        whisper_free(wctx->ctx_wsp);
> +        wctx->ctx_wsp = NULL;
> +        return AVERROR(EIO);
> +    }
> +
> +    // Init VAD model context
> +    if (wctx->vad_model_path)
> +    {
> +        struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
> +        ctx_params.n_threads = 4;
> +        // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
> +        ctx_params.gpu_device = wctx->gpu_device;
> +        wctx->ctx_vad = whisper_vad_init_from_file_with_params(
> +            wctx->vad_model_path,
> +            ctx_params);
> +
> +        wctx->vad_params = whisper_vad_default_params();
> +        wctx->vad_params.threshold = wctx->vad_threshold;
> +        wctx->vad_params.min_speech_duration_ms = wctx->vad_min_speech_duration;
> +        wctx->vad_params.min_silence_duration_ms = wctx->vad_min_silence_duration;

> +        wctx->vad_params.max_speech_duration_s = (float)(wctx->audio_buffer_queue_size / 1000.0f);

The float cast is unneeded.

> +        wctx->vad_params.speech_pad_ms = 0;
> +        wctx->vad_params.samples_overlap = 0;
> +    }
> +
> +    // Init buffer
> +    wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000;
> +    wctx->audio_buffer = av_malloc(wctx->audio_buffer_queue_size * sizeof(float));
> +    if (!wctx->audio_buffer)
> +    {
> +        return AVERROR(ENOMEM);
> +    }
> +

> +    wctx->audio_buffer_fill_size = 0;
> +
> +    wctx->next_pts = AV_NOPTS_VALUE;
> +
> +    wctx->avio_context = NULL;

Aren't things already initialized to 0?

> +    if (wctx->destination && strcmp("", wctx->destination))

> +    {

> +        int ret = 0;

Useless initialization.

> +
> +        if (!strcmp("-", wctx->destination))
> +        {
> +            ret = avio_open(&wctx->avio_context, "pipe:1", AVIO_FLAG_WRITE);
> +        }
> +        else
> +        {
> +            ret = avio_open(&wctx->avio_context, wctx->destination, AVIO_FLAG_WRITE);
> +        }

const char *dst = wctx->destination;
if (!strcmp("-", wctx->destination))
    dst = "pipe:1";
int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);

[...]
> +    if (segments_text)
> +    {
> +        av_free(segments_text);
> +    }

The NULL check isn't needed, and please use av_freep(&) instead of av_free(),
as it clears the pointer and that's just more robust.

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The educated differ from the uneducated as much as the living from the
dead. -- Aristotle 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20250710/0a2563cf/attachment.sig>