[FFmpeg-devel] [PATCH] Whisper audio filter
Vittorio Palmisano
vpalmisano at gmail.com
Thu Jul 10 13:25:41 EEST 2025
Add a new audio filter for running audio transcription with the Whisper model.
Documentation and examples are included in the patch.
Signed-off-by: Vittorio Palmisano <vpalmisano at gmail.com>
---
configure | 5 +
doc/filters.texi | 101 ++++++++
libavfilter/Makefile | 2 +
libavfilter/af_whisper.c | 488 +++++++++++++++++++++++++++++++++++++++
libavfilter/allfilters.c | 2 +
5 files changed, 598 insertions(+)
create mode 100644 libavfilter/af_whisper.c
diff --git a/configure b/configure
index 2ccafe7c20..573dfc67dc 100755
--- a/configure
+++ b/configure
@@ -337,6 +337,7 @@ External library support:
--enable-vapoursynth enable VapourSynth demuxer [no]
--disable-xlib disable xlib [autodetect]
--disable-zlib disable zlib [autodetect]
+ --enable-whisper enable whisper filter [no]
The following libraries provide various hardware acceleration features:
--disable-amf disable AMF video encoding code [autodetect]
@@ -2003,6 +2004,7 @@ EXTERNAL_LIBRARY_LIST="
pocketsphinx
vapoursynth
vulkan_static
+ whisper
"
HWACCEL_AUTODETECT_LIBRARY_LIST="
@@ -4059,6 +4061,7 @@ xstack_qsv_filter_deps="libmfx"
xstack_qsv_filter_select="qsvvpp"
pad_vaapi_filter_deps="vaapi_1"
drawbox_vaapi_filter_deps="vaapi_1"
+whisper_filter_deps="whisper"
# examples
avio_http_serve_files_deps="avformat avutil fork"
@@ -7108,6 +7111,8 @@ enabled libvo_amrwbenc && require libvo_amrwbenc vo-amrwbenc/enc_if.h E_IF_in
enabled libvorbis && require_pkg_config libvorbis vorbis vorbis/codec.h vorbis_info_init &&
require_pkg_config libvorbisenc vorbisenc vorbis/vorbisenc.h vorbis_encode_init
+enabled whisper && require_pkg_config whisper "whisper >= 1.7.5" whisper.h whisper_init_from_file_with_params
+
enabled libvpx && {
enabled libvpx_vp8_decoder && {
check_pkg_config libvpx_vp8_decoder "vpx >= 1.4.0" "vpx/vpx_decoder.h vpx/vp8dx.h" vpx_codec_vp8_dx ||
diff --git a/doc/filters.texi b/doc/filters.texi
index ed2956fe75..c00e73478f 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -7682,6 +7682,107 @@ There are 6 samples at -4 dB, 62 at -5 dB, 286 at -6 dB, etc.
In other words, raising the volume by +4 dB does not cause any clipping,
raising it by +5 dB causes clipping for 6 samples, etc.
+@anchor{whisper}
+@section whisper
+
+This filter runs automatic speech recognition using OpenAI's Whisper model.
+
+It requires the whisper.cpp library (https://github.com/ggml-org/whisper.cpp)
+as a prerequisite. After installing the library, the filter can be enabled with
+@code{./configure --enable-whisper}.
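+
+A minimal build sketch, assuming whisper.cpp's standard CMake workflow (the
+exact commands and install prefix may vary with your setup):
+@example
+git clone https://github.com/ggml-org/whisper.cpp
+cmake -B whisper.cpp/build -S whisper.cpp
+cmake --build whisper.cpp/build --config Release
+sudo cmake --install whisper.cpp/build
+./configure --enable-whisper
+@end example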
+
+The filter accepts the following options:
+
+@table @option
+@item model
+The file path of the downloaded whisper.cpp model (mandatory).
+
+@item language
+The language to use for transcription ('auto' for auto-detect).
+Default value: @code{"auto"}
+
+@item queue
+The maximum duration of audio, in milliseconds, that is queued in the filter
+before it is processed with whisper.
+Default value: @code{"3000"}
+
+@item use_gpu
+Whether to enable GPU support.
+Default value: @code{"true"}
+
+@item gpu_device
+The GPU device index to use.
+Default value: @code{"0"}
+
+@item destination
+If set, the transcription output will be sent to the specified file or URL
+(use one of the FFmpeg AVIO protocols); otherwise, the output will be logged as
+info messages.
+The output will also be set in the "lavfi.whisper.text" frame metadata.
+
+@item format
+The destination format string; it can be "text" (only the transcribed text
+will be sent to the destination), "srt" (SubRip subtitle format) or "json".
+Default value: @code{"text"}
+
+@item vad_model
+Path to the VAD model file. If set, the filter will load an additional voice
+activity detection module (https://github.com/snakers4/silero-vad) that will be
+used to fragment the audio queue; set this option to a valid path obtained
+from the whisper.cpp repository (e.g. "../whisper.cpp/models/ggml-silero-v5.1.2.bin")
+and increase the @option{queue} parameter to a higher value (e.g. 10000).
+
+@item vad_threshold
+The VAD threshold to use.
+Default value: @code{"0.5"}
+
+@item vad_min_speech_duration
+The minimum VAD speech duration in milliseconds.
+Default value: @code{"50"}
+
+@item vad_min_silence_duration
+The minimum VAD silence duration in milliseconds.
+Default value: @code{"500"}
+
+@end table
+
+@subsection Examples
+@itemize
+
+@item
+Run a transcription with SRT file generation:
+@example
+ffmpeg -i input.mp4 -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=model=../whisper.cpp/models/ggml-base.en.bin\
+:language=en\
+:queue=3000\
+:destination=output.srt\
+:format=srt" -f null -
+@end example
+
+@item
+Run a transcription and send the output in JSON format to an HTTP service:
+@example
+ffmpeg -i input.mp4 -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=model=../whisper.cpp/models/ggml-base.en.bin\
+:language=en\
+:queue=3000\
+:destination=http\\://localhost\\:3000\
+:format=json" -f null -
+@end example
+
+@item
+Transcribe the microphone input using the VAD option:
+@example
+ffmpeg -loglevel warning -f pulse -i default \
+-af 'highpass=f=200,lowpass=f=3000,aformat=sample_rates=16000:channel_layouts=mono,whisper=model=../whisper.cpp/models/ggml-medium.bin\
+:language=en\
+:queue=10000\
+:destination=-\
+:format=json\
+:vad_model=../whisper.cpp/models/ggml-silero-v5.1.2.bin' -f null -
+@end example
+
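+@item
+Inspect the transcription as frame metadata using the @code{ametadata} filter
+(this filter exports the text in the @code{lavfi.whisper.text} key):
+@example
+ffmpeg -i input.mp4 -vn -af "aformat=sample_rates=16000:channel_layouts=mono,whisper=model=../whisper.cpp/models/ggml-base.en.bin,ametadata=mode=print" -f null -
+@end example
+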
+@end itemize
+
@c man end AUDIO FILTERS
@chapter Audio Sources
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 9e9153f5b0..e133422ca4 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -188,6 +188,8 @@ OBJS-$(CONFIG_HILBERT_FILTER) += asrc_hilbert.o
OBJS-$(CONFIG_SINC_FILTER) += asrc_sinc.o
OBJS-$(CONFIG_SINE_FILTER) += asrc_sine.o
+OBJS-$(CONFIG_WHISPER_FILTER) += af_whisper.o
+
OBJS-$(CONFIG_ANULLSINK_FILTER) += asink_anullsink.o
# video filters
diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c
new file mode 100644
index 0000000000..81d90a77d7
--- /dev/null
+++ b/libavfilter/af_whisper.c
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2025 Vittorio Palmisano
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/opt.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/samplefmt.h"
+#include "libavfilter/avfilter.h"
+#include "libavfilter/audio.h"
+#include "libavutil/mem.h"
+#include "libavutil/avstring.h"
+#include "libavutil/internal.h"
+#include "libavformat/avio.h"
+#include "libavutil/thread.h"
+
+#include "formats.h"
+
+#include "whisper.h"
+
+typedef struct WhisperContext {
+ const AVClass *class;
+ char *model_path;
+ char *language;
+ int use_gpu; /* AV_OPT_TYPE_BOOL options are stored as int */
+ int gpu_device;
+ char *vad_model_path;
+ float vad_threshold;
+ int vad_min_speech_duration;
+ int vad_min_silence_duration;
+
+ int queue;
+ char *destination;
+ char *format;
+
+ struct whisper_context *ctx_wsp;
+ struct whisper_vad_context *ctx_vad;
+ struct whisper_vad_params vad_params;
+
+ float *audio_buffer; ///< queued mono FLT samples awaiting transcription
+ int audio_buffer_queue_size; ///< allocated buffer capacity, in samples
+ int audio_buffer_fill_size; ///< number of samples currently queued
+ int audio_buffer_vad_size; ///< fill level when VAD last ran, in samples
+
+ int eof;
+ int64_t next_pts;
+
+ AVIOContext *avio_context;
+ int index;
+ int64_t timestamp;
+} WhisperContext;
+
+static void cb_log_disable(enum ggml_log_level level, const char *text, void *user_data)
+{
+}
+
+static int init(AVFilterContext *ctx)
+{
+ WhisperContext *wctx = ctx->priv;
+
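+ /* register all available ggml backends so whisper.cpp can pick a device */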
+ ggml_backend_load_all();
+ whisper_log_set(cb_log_disable, NULL);
+
+ // Init whisper context
+ if (!wctx->model_path) {
+ av_log(ctx, AV_LOG_ERROR,
+ "No whisper model path specified. Use the 'model' option.\n");
+ return AVERROR(EINVAL);
+ }
+
+ struct whisper_context_params params =
+ whisper_context_default_params();
+ params.use_gpu = wctx->use_gpu;
+ params.gpu_device = wctx->gpu_device;
+
+ wctx->ctx_wsp =
+ whisper_init_from_file_with_params(wctx->model_path, params);
+ if (wctx->ctx_wsp == NULL) {
+ av_log(ctx, AV_LOG_ERROR,
+ "Failed to initialize whisper context from model: %s\n",
+ wctx->model_path);
+ return AVERROR(EIO);
+ }
+
+ // Init VAD model context
+ if (wctx->vad_model_path) {
+ struct whisper_vad_context_params ctx_params =
+ whisper_vad_default_context_params();
+ ctx_params.n_threads = 4;
+ // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
+ ctx_params.gpu_device = wctx->gpu_device;
+ wctx->ctx_vad =
+ whisper_vad_init_from_file_with_params(wctx->vad_model_path,
+ ctx_params);
+ if (wctx->ctx_vad == NULL) {
+ av_log(ctx, AV_LOG_ERROR,
+ "Failed to initialize VAD context from model: %s\n",
+ wctx->vad_model_path);
+ return AVERROR(EIO);
+ }
+
+ wctx->vad_params = whisper_vad_default_params();
+ wctx->vad_params.threshold = wctx->vad_threshold;
+ wctx->vad_params.min_speech_duration_ms =
+ wctx->vad_min_speech_duration;
+ wctx->vad_params.min_silence_duration_ms =
+ wctx->vad_min_silence_duration;
+ /* the queue option is in milliseconds; max_speech_duration_s expects
+ * seconds (audio_buffer_queue_size is not computed yet at this point) */
+ wctx->vad_params.max_speech_duration_s = wctx->queue / 1000.0f;
+ wctx->vad_params.speech_pad_ms = 0;
+ wctx->vad_params.samples_overlap = 0;
+ }
+ // Init buffer
+ wctx->audio_buffer_queue_size =
+ WHISPER_SAMPLE_RATE * wctx->queue / 1000;
+ wctx->audio_buffer =
+ av_malloc(wctx->audio_buffer_queue_size * sizeof(float));
+ if (!wctx->audio_buffer)
+ return AVERROR(ENOMEM);
+
+ wctx->next_pts = AV_NOPTS_VALUE;
+
+ if (wctx->destination && strcmp("", wctx->destination)) {
+ const char *dst = wctx->destination;
+ if (!strcmp("-", dst))
+ dst = "pipe:1";
+ int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);
+
+ if (ret < 0) {
+ av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n",
+ wctx->destination, av_err2str(ret));
+ return ret;
+ }
+
+ wctx->avio_context->direct = AVIO_FLAG_DIRECT;
+ }
+
+ av_log(ctx, AV_LOG_INFO,
+ "Whisper filter initialized: model: %s lang: %s queue: %d ms\n",
+ wctx->model_path, wctx->language, wctx->queue);
+
+ return 0;
+}
+
+static void uninit(AVFilterContext *ctx)
+{
+ WhisperContext *wctx = ctx->priv;
+
+ if (wctx->audio_buffer_fill_size > 0) {
+ av_log(ctx, AV_LOG_WARNING,
+ "Remaining audio buffer %d samples (%d seconds) after stopping\n",
+ wctx->audio_buffer_fill_size,
+ wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
+ }
+
+ if (wctx->ctx_vad) {
+ whisper_vad_free(wctx->ctx_vad);
+ wctx->ctx_vad = NULL;
+ }
+
+ if (wctx->ctx_wsp) {
+ whisper_free(wctx->ctx_wsp);
+ wctx->ctx_wsp = NULL;
+ }
+
+ av_freep(&wctx->audio_buffer);
+
+ if (wctx->avio_context)
+ avio_closep(&wctx->avio_context);
+}
+
+static void run_transcription(AVFilterContext *ctx,
+ AVDictionary **metadata, int end_pos)
+{
+ WhisperContext *wctx = ctx->priv;
+ end_pos = FFMIN(end_pos, wctx->audio_buffer_fill_size);
+
+ if (!wctx->ctx_wsp || end_pos == 0)
+ return;
+
+ float duration = (float) end_pos / WHISPER_SAMPLE_RATE;
+
+ av_log(ctx, AV_LOG_INFO,
+ "run transcription %d/%d samples (%.2f seconds)...\n", end_pos,
+ wctx->audio_buffer_fill_size, duration);
+
+ struct whisper_full_params params =
+ whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+ params.language = wctx->language;
+ params.print_special = 0;
+ params.print_progress = 0;
+ params.print_realtime = 0;
+ params.print_timestamps = 0;
+
+ if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, end_pos) != 0) {
+ av_log(ctx, AV_LOG_ERROR,
+ "Failed to process audio with whisper.cpp\n");
+ return;
+ }
+
+ const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
+ char *segments_text = NULL;
+
+ for (int i = 0; i < n_segments; ++i) {
+ const bool turn =
+ whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
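+ /* whisper reports segment timestamps in 10 ms units; convert to ms */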
+ const int64_t t0 =
+ whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
+ const int64_t t1 =
+ whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
+ const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);
+ /* whisper usually prefixes segment text with a space; skip it if present */
+ if (text[0] == ' ')
+ text++;
+ char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");
+
+ if (!text_cleaned || av_strnlen(text_cleaned, 1) == 0) {
+ av_freep(&text_cleaned);
+ continue;
+ }
+ av_log(ctx, AV_LOG_INFO, " [%"PRId64"-%"PRId64"%s]: \"%s\"\n",
+ wctx->timestamp + t0, wctx->timestamp + t1,
+ turn ? " (turn)" : "", text_cleaned);
+
+ if (segments_text) {
+ char *new_text =
+ av_asprintf("%s%s", segments_text, text_cleaned);
+ av_freep(&segments_text);
+ segments_text = new_text;
+ } else
+ segments_text = av_strdup(text_cleaned);
+
+ if (wctx->avio_context) {
+ const int64_t start_t = wctx->timestamp + t0;
+ const int64_t end_t = wctx->timestamp + t1;
+ char *buf = NULL;
+
+ if (!av_strcasecmp(wctx->format, "srt")) {
+ /* SRT timestamps use a comma as decimal separator */
+ buf = av_asprintf("%d\n%02"PRId64":%02"PRId64":%02"PRId64",%03"PRId64
+ " --> %02"PRId64":%02"PRId64":%02"PRId64",%03"PRId64"\n%s\n\n",
+ wctx->index, start_t / 3600000,
+ (start_t / 60000) % 60, (start_t / 1000) % 60,
+ start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
+ (end_t / 1000) % 60, end_t % 1000, text_cleaned);
+ } else if (!av_strcasecmp(wctx->format, "json")) {
+ buf = av_asprintf("{\"start\":%"PRId64",\"end\":%"PRId64",\"text\":\"%s\",\"turn\":%s}\n",
+ start_t, end_t, text_cleaned,
+ turn ? "true" : "false");
+ } else
+ buf = av_strdup(text_cleaned);
+
+ if (buf) {
+ avio_write(wctx->avio_context, buf, strlen(buf));
+ av_freep(&buf);
+ }
+ }
+
+ av_freep(&text_cleaned);
+ }
+
+ wctx->index++;
+ wctx->timestamp += (int64_t) (duration * 1000);
+
+ if (metadata && segments_text) {
+ av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
+ char *duration_text = av_asprintf("%f", duration);
+ av_dict_set(metadata, "lavfi.whisper.duration", duration_text, 0);
+ av_freep(&duration_text);
+ }
+ av_freep(&segments_text);
+
+ /* shift the remaining samples to the front; the regions can overlap, and
+ * the length is the tail left after consuming end_pos samples */
+ memmove(wctx->audio_buffer, wctx->audio_buffer + end_pos,
+ (wctx->audio_buffer_fill_size - end_pos) * sizeof(float));
+ wctx->audio_buffer_fill_size -= end_pos;
+ wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+ AVFilterContext *ctx = inlink->dst;
+ WhisperContext *wctx = ctx->priv;
+ AVFilterLink *outlink = ctx->outputs[0];
+ AVDictionary **metadata = &frame->metadata;
+
+ const int samples = frame->nb_samples;
+ const float *input_data = (const float *) frame->data[0];
+
+ if (wctx->audio_buffer_fill_size + samples >
+ wctx->audio_buffer_queue_size) {
+ run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
+ }
+
+ memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data,
+ samples * sizeof(float));
+ wctx->audio_buffer_fill_size += samples;
+
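+ /* run VAD only once enough new audio has accumulated to cover at least
+ * one minimum speech + minimum silence interval */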
+ if (wctx->ctx_vad
+ && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
+ WHISPER_SAMPLE_RATE * (wctx->vad_min_speech_duration +
+ wctx->vad_min_silence_duration) / 1000) {
+ struct whisper_vad_segments *segments =
+ whisper_vad_segments_from_samples(wctx->ctx_vad,
+ wctx->vad_params,
+ wctx->audio_buffer,
+ wctx->audio_buffer_fill_size);
+ wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
+
+ if (!segments) {
+ av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
+ } else {
+ int n_segments = whisper_vad_segments_n_segments(segments);
+
+ if (n_segments > 0) {
+ /* VAD timestamps are in 10 ms units; convert to ms */
+ const int64_t start_ms =
+ whisper_vad_segments_get_segment_t0(segments, n_segments - 1) * 10;
+ const int64_t end_ms =
+ whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10;
+ int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
+
+ if (end_pos < wctx->audio_buffer_fill_size) {
+ av_log(ctx, AV_LOG_INFO,
+ "VAD detected %d segments, start: %"PRId64" ms, end: %"PRId64" ms (buffer: %d ms)\n",
+ n_segments, start_ms, end_ms,
+ 1000 * wctx->audio_buffer_fill_size /
+ WHISPER_SAMPLE_RATE);
+ run_transcription(ctx, metadata, end_pos);
+ }
+ }
+
+ whisper_vad_free_segments(segments);
+ }
+ } else if (wctx->audio_buffer_fill_size >=
+ wctx->audio_buffer_queue_size)
+ run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
+
+ wctx->next_pts = frame->pts +
+ av_rescale_q(frame->nb_samples,
+ (AVRational) { 1, inlink->sample_rate },
+ inlink->time_base);
+ return ff_filter_frame(outlink, frame);
+}
+
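+/* At EOF, flush any remaining queued audio through whisper and emit a single
+ * silent sample so the final transcription can be attached as frame metadata. */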
+static int push_last_frame(AVFilterLink *outlink)
+{
+ AVFilterContext *ctx = outlink->src;
+ WhisperContext *wctx = ctx->priv;
+ AVFrame *frame;
+ int n_out = 1;
+
+ if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
+ return 0;
+ frame = ff_get_audio_buffer(outlink, n_out);
+ if (!frame)
+ return AVERROR(ENOMEM);
+
+ av_samples_set_silence(frame->extended_data, 0,
+ n_out,
+ frame->ch_layout.nb_channels, frame->format);
+
+ frame->pts = wctx->next_pts;
+ if (wctx->next_pts != AV_NOPTS_VALUE)
+ wctx->next_pts += av_rescale_q(n_out,
+ (AVRational) { 1, outlink->sample_rate },
+ outlink->time_base);
+
+ run_transcription(ctx, &frame->metadata, wctx->audio_buffer_fill_size);
+
+ return ff_filter_frame(outlink, frame);
+}
+
+static int activate(AVFilterContext *ctx)
+{
+ AVFilterLink *inlink = ctx->inputs[0];
+ AVFilterLink *outlink = ctx->outputs[0];
+ WhisperContext *wctx = ctx->priv;
+ int64_t pts;
+ int status;
+
+ FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+ if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
+ AVFrame *frame = NULL;
+ int ret;
+
+ ret = ff_inlink_consume_frame(inlink, &frame);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return filter_frame(inlink, frame);
+ }
+
+ if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
+ wctx->eof = status == AVERROR_EOF;
+
+ if (wctx->eof) {
+ push_last_frame(outlink);
+
+ ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
+ return 0;
+ }
+
+ FF_FILTER_FORWARD_WANTED(outlink, inlink);
+
+ return FFERROR_NOT_READY;
+}
+
+#define OFFSET(x) offsetof(WhisperContext, x)
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
+
+static const AVOption whisper_options[] = {
+ { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
+ { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, { .str = "auto" }, .flags = FLAGS },
+ { "queue", "Audio queue size in milliseconds", OFFSET(queue), AV_OPT_TYPE_INT, { .i64 = 3000 }, 20, INT_MAX, .flags = FLAGS },
+ { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, .flags = FLAGS },
+ { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
+ { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, { .str = "" }, .flags = FLAGS },
+ { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, { .str = "text" }, .flags = FLAGS },
+ { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
+ { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0.0, 1.0, .flags = FLAGS },
+ { "vad_min_speech_duration", "Minimum speech duration in milliseconds for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_INT, { .i64 = 50 }, 20, INT_MAX, .flags = FLAGS },
+ { "vad_min_silence_duration", "Minimum silence duration in milliseconds for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_INT, { .i64 = 500 }, 0, INT_MAX, .flags = FLAGS },
+ { NULL }
+};
+
+static const AVClass whisper_class = {
+ .class_name = "whisper",
+ .item_name = av_default_item_name,
+ .option = whisper_options,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVFilterPad whisper_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_AUDIO,
+ },
+};
+
+const FFFilter ff_af_whisper = {
+ .p.name = "whisper",
+ .p.description =
+ NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
+ .p.priv_class = &whisper_class,
+ .p.flags = AVFILTER_FLAG_METADATA_ONLY,
+ .init = init,
+ .uninit = uninit,
+ .activate = activate,
+ .priv_size = sizeof(WhisperContext),
+ FILTER_INPUTS(ff_audio_default_filterpad),
+ FILTER_OUTPUTS(whisper_outputs),
+ FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP),
+};
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 409099bf1f..eaf0c8fe6f 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -163,6 +163,8 @@ extern const FFFilter ff_af_virtualbass;
extern const FFFilter ff_af_volume;
extern const FFFilter ff_af_volumedetect;
+extern const FFFilter ff_af_whisper;
+
extern const FFFilter ff_asrc_aevalsrc;
extern const FFFilter ff_asrc_afdelaysrc;
extern const FFFilter ff_asrc_afireqsrc;
--
2.43.0