From af5833e29819810f2d83228228a9a3077e5ccd93 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Fri, 31 May 2024 11:37:29 +0300 Subject: [PATCH] whisper : remove `speed_up` and `phase_vocoder*` functions (#2198) * whisper : fix cast warning * whisper : remove phase_vocoder functions, ref #2195 * whisper : remove speed_up from whisper_full_params, closes #2195 --- bindings/go/examples/go-whisper/flags.go | 9 --- bindings/go/params.go | 7 --- bindings/go/pkg/whisper/context.go | 5 -- bindings/go/pkg/whisper/interface.go | 1 - .../whispercpp/WhisperCppJnaLibrary.java | 10 +-- .../whispercpp/params/WhisperFullParams.java | 10 +-- bindings/ruby/ext/ruby_whisper.cpp | 8 --- bindings/ruby/tests/test_whisper.rb | 7 --- examples/addon.node/addon.cpp | 3 - examples/command/command.cpp | 5 -- examples/common.h | 2 +- examples/lsp/lsp.cpp | 5 -- examples/main/main.cpp | 4 -- examples/server/server.cpp | 4 -- examples/stream/stream.cpp | 4 -- examples/talk-llama/talk-llama.cpp | 4 -- examples/talk/talk.cpp | 4 -- examples/wchess/wchess.cmd/wchess.cmd.cpp | 3 - whisper.cpp | 63 ++++--------------- whisper.h | 17 ----- 20 files changed, 14 insertions(+), 161 deletions(-) diff --git a/bindings/go/examples/go-whisper/flags.go b/bindings/go/examples/go-whisper/flags.go index ea204455c80..766c92f1827 100644 --- a/bindings/go/examples/go-whisper/flags.go +++ b/bindings/go/examples/go-whisper/flags.go @@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string { return strings.ToLower(flags.Lookup("out").Value.String()) } -func (flags *Flags) IsSpeedup() bool { - return flags.Lookup("speedup").Value.String() == "true" -} - func (flags *Flags) IsTokens() bool { return flags.Lookup("tokens").Value.String() == "true" } @@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error { fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration) context.SetDuration(duration) } - if flags.IsSpeedup() { - fmt.Fprintf(flags.Output(), "Setting speedup to true\n") - 
context.SetSpeedup(true) - } if threads := flags.GetThreads(); threads != 0 { fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads) context.SetThreads(threads) @@ -146,7 +138,6 @@ func registerFlags(flag *Flags) { flag.Duration("offset", 0, "Time offset") flag.Duration("duration", 0, "Duration of audio to process") flag.Uint("threads", 0, "Number of threads to use") - flag.Bool("speedup", false, "Enable speedup") flag.Uint("max-len", 0, "Maximum segment length in characters") flag.Uint("max-tokens", 0, "Maximum tokens per segment") flag.Float64("word-thold", 0, "Maximum segment score") diff --git a/bindings/go/params.go b/bindings/go/params.go index 5931bb0b199..4b4da032d62 100644 --- a/bindings/go/params.go +++ b/bindings/go/params.go @@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) { p.print_timestamps = toBool(v) } -func (p *Params) SetSpeedup(v bool) { - p.speed_up = toBool(v) -} - // Set language id func (p *Params) SetLanguage(lang int) error { if lang == -1 { @@ -177,9 +173,6 @@ func (p *Params) String() string { if p.token_timestamps { str += " token_timestamps" } - if p.speed_up { - str += " speed_up" - } return str + ">" } diff --git a/bindings/go/pkg/whisper/context.go b/bindings/go/pkg/whisper/context.go index 0863ef6bb16..ead92648f3e 100644 --- a/bindings/go/pkg/whisper/context.go +++ b/bindings/go/pkg/whisper/context.go @@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) { context.params.SetTranslate(v) } -// Set speedup flag -func (context *context) SetSpeedup(v bool) { - context.params.SetSpeedup(v) -} - func (context *context) SetSplitOnWord(v bool) { context.params.SetSplitOnWord(v) } diff --git a/bindings/go/pkg/whisper/interface.go b/bindings/go/pkg/whisper/interface.go index 4339e16f847..b430e7ce853 100644 --- a/bindings/go/pkg/whisper/interface.go +++ b/bindings/go/pkg/whisper/interface.go @@ -41,7 +41,6 @@ type Context interface { SetOffset(time.Duration) // Set offset SetDuration(time.Duration) // Set 
duration SetThreads(uint) // Set number of threads to use - SetSpeedup(bool) // Set speedup flag SetSplitOnWord(bool) // Set split on word flag SetTokenThreshold(float32) // Set timestamp token probability threshold SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java index 56a37380136..1a73cee1181 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java @@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library { * @return Whisper context on success, null on failure */ Pointer whisper_init_from_file(String path_model); - + /** * Provides default params which can be used with `whisper_init_from_file_with_params()` etc. * Because this function allocates memory for the params, the caller must call either: @@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library { /** Language id associated with the provided state */ int whisper_full_lang_id_from_state(Pointer state); - /** - * Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. - * The resulting spectrogram is stored inside the default state of the provided whisper context. - * @return 0 on success - */ - int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads); - - int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads); /** Get the start time of the specified segment. 
*/ long whisper_full_get_segment_t0(Pointer ctx, int i_segment); diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java index 60d8334b935..90d8c15767c 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java @@ -129,14 +129,6 @@ public void splitOnWord(boolean enable) { /** Maximum tokens per segment (0, default = no limit) */ public int max_tokens; - /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */ - public CBool speed_up; - - /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */ - public void speedUp(boolean enable) { - speed_up = enable ? CBool.TRUE : CBool.FALSE; - } - /** Overwrite the audio context size (0 = use default). */ public int audio_ctx; @@ -321,7 +313,7 @@ protected List getFieldOrder() { return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate", "no_context", "single_segment", "no_timestamps", "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps", - "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx", + "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx", "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language", "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty", "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search", diff --git a/bindings/ruby/ext/ruby_whisper.cpp b/bindings/ruby/ext/ruby_whisper.cpp index 86af9391e2c..9d9334539b8 100644 --- a/bindings/ruby/ext/ruby_whisper.cpp +++ b/bindings/ruby/ext/ruby_whisper.cpp @@ 
-311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) { static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) { BOOL_PARAMS_SETTER(self, split_on_word, value) } -static VALUE ruby_whisper_params_get_speed_up(VALUE self) { - BOOL_PARAMS_GETTER(self, speed_up) -} -static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) { - BOOL_PARAMS_SETTER(self, speed_up, value) -} static VALUE ruby_whisper_params_get_diarize(VALUE self) { ruby_whisper_params *rwp; Data_Get_Struct(self, ruby_whisper_params, rwp); @@ -408,8 +402,6 @@ void Init_whisper() { rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1); rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0); rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1); - rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0); - rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1); rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0); rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1); diff --git a/bindings/ruby/tests/test_whisper.rb b/bindings/ruby/tests/test_whisper.rb index fa6a3e2d4e8..3700671bce6 100644 --- a/bindings/ruby/tests/test_whisper.rb +++ b/bindings/ruby/tests/test_whisper.rb @@ -117,13 +117,6 @@ def test_split_on_word assert !@params.split_on_word end - def test_speed_up - @params.speed_up = true - assert @params.speed_up - @params.speed_up = false - assert !@params.speed_up - end - def test_whisper @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin')) params = Whisper::Params.new diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 53bf1abb5a3..4ada6ca5084 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -25,7 +25,6 @@ struct whisper_params { float entropy_thold = 2.4f; 
float logprob_thold = -1.0f; - bool speed_up = false; bool translate = false; bool diarize = false; bool output_txt = false; @@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector> &result) { wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; - wparams.greedy.best_of = params.best_of; wparams.beam_search.beam_size = params.beam_size; diff --git a/examples/command/command.cpp b/examples/command/command.cpp index cd6cc023994..84424d4331b 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -38,7 +38,6 @@ struct whisper_params { grammar_parser::parse_state grammar_parsed; - bool speed_up = false; bool translate = false; bool print_special = false; bool print_energy = false; @@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } @@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced
accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); @@ -165,7 +162,6 @@ std::string transcribe( wparams.n_threads = params.n_threads; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; wparams.temperature = 0.4f; wparams.temperature_inc = 1.0f; @@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const wparams.n_threads = params.n_threads; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; wparams.prompt_tokens = k_tokens.data(); wparams.prompt_n_tokens = k_tokens.size(); diff --git a/examples/common.h b/examples/common.h index 2ed91ca9aa8..de895858ab0 100644 --- a/examples/common.h +++ b/examples/common.h @@ -185,7 +185,7 @@ class wav_writer { // It is assumed that PCM data is normalized to a range from -1 to 1 bool write_audio(const float * data, size_t length) { for (size_t i = 0; i < length; ++i) { - const int16_t intSample = data[i] * 32767; + const int16_t intSample = int16_t(data[i] * 32767); file.write(reinterpret_cast(&intSample), sizeof(int16_t)); dataSize += sizeof(int16_t); } diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp index 3df54266a25..8cca87151bf 100644 --- a/examples/lsp/lsp.cpp +++ b/examples/lsp/lsp.cpp @@ -26,7 +26,6 @@ struct whisper_params { float vad_thold = 0.6f; float freq_thold = 100.0f; - bool speed_up = false; bool translate = false; bool print_special = false; bool print_energy = false; @@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if 
(arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } @@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? 
"true" : "false"); @@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js wparams.n_threads = params.n_threads; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; wparams.suppress_non_speech_tokens = true; // run the transformer and a single decoding pass if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { @@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons wparams.n_threads = params.n_threads; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; // TODO: Do some time testing. Does an overly long prompt slow down processing? // Set up command sets/precompute prompts diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 45eb17fe7f3..bb9b7b79ce5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -47,7 +47,6 @@ struct whisper_params { float temperature = 0.0f; float temperature_inc = 0.2f; - bool speed_up = false; bool debug_mode = false; bool translate = false; bool detect_language = false; @@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); } else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); } - // else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } @@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", 
params.logprob_thold); fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature); fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc); - // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); @@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) { wparams.split_on_word = params.split_on_word; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; wparams.debug_mode = params.debug_mode; wparams.tdrz_enable = params.tinydiarize; // [TDRZ] diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2efa4c7a020..10aae9c04d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -61,7 +61,6 @@ struct whisper_params { float temperature = 0.00f; float temperature_inc = 0.20f; - bool speed_up = false; bool debug_mode = false; bool translate = false; bool detect_language = false; @@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); - // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? 
"true" : "false"); fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); @@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } - // else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } @@ -768,7 +765,6 @@ int main(int argc, char ** argv) { wparams.split_on_word = params.split_on_word; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; wparams.debug_mode = params.debug_mode; wparams.tdrz_enable = params.tinydiarize; // [TDRZ] diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 60c1b0894e4..50797e96daa 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -27,7 +27,6 @@ struct whisper_params { float vad_thold = 0.6f; float freq_thold = 100.0f; - bool speed_up = false; bool translate = false; bool no_fallback = false; bool print_special = false; @@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } 
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } @@ -100,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? 
"true" : "false"); @@ -314,7 +311,6 @@ int main(int argc, char ** argv) { wparams.n_threads = params.n_threads; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; wparams.tdrz_enable = params.tinydiarize; // [TDRZ] diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index 4aab62b9a6f..b15be0b2789 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -59,7 +59,6 @@ struct whisper_params { float vad_thold = 0.6f; float freq_thold = 100.0f; - bool speed_up = false; bool translate = false; bool print_special = false; bool print_energy = false; @@ -100,7 +99,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } @@ -149,7 +147,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? 
"true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); @@ -205,7 +202,6 @@ std::string transcribe( wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size(); wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { return ""; diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp index 3e34e5724ff..b34fad6c2bb 100644 --- a/examples/talk/talk.cpp +++ b/examples/talk/talk.cpp @@ -26,7 +26,6 @@ struct whisper_params { float vad_thold = 0.6f; float freq_thold = 100.0f; - bool speed_up = false; bool translate = false; bool print_special = false; bool print_energy = false; @@ -60,7 +59,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } @@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - 
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); @@ -132,7 +129,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con wparams.n_threads = params.n_threads; wparams.audio_ctx = params.audio_ctx; - wparams.speed_up = params.speed_up; if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { return ""; diff --git a/examples/wchess/wchess.cmd/wchess.cmd.cpp b/examples/wchess/wchess.cmd/wchess.cmd.cpp index 09e53f13172..4d049976315 100644 --- a/examples/wchess/wchess.cmd/wchess.cmd.cpp +++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp @@ -26,7 +26,6 @@ struct whisper_params { float grammar_penalty = 100.0f; - bool speed_up = false; bool translate = false; bool print_special = false; bool print_energy = false; @@ -57,7 +56,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? 
"true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); @@ -89,7 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } diff --git a/whisper.cpp b/whisper.cpp index a22da8896bb..dbb235e9f43 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2868,13 +2868,10 @@ struct whisper_global_cache { // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 float hann_window[WHISPER_N_FFT]; - float hann_window2x[WHISPER_N_FFT * 2]; whisper_global_cache() { fill_sin_cos_table(); -#define FILL_HANN_WINDOW(arr) fill_hann_window(sizeof(arr) / sizeof(arr[0]), true, arr) - FILL_HANN_WINDOW(hann_window); - FILL_HANN_WINDOW(hann_window2x); + fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window); } void fill_sin_cos_table() { @@ -2885,7 +2882,7 @@ struct whisper_global_cache { } } - void fill_hann_window(int length, bool periodic, float* output) { + void fill_hann_window(int length, bool periodic, float * output) { int offset = -1; if (periodic) { offset = 0; @@ -3061,15 +3058,8 @@ static bool log_mel_spectrogram( const int64_t t_start_us = ggml_time_us(); // Hann window - const float * hann = nullptr; - if (frame_size == WHISPER_N_FFT) { - hann = global_cache.hann_window; 
- } else if (frame_size == 2 * WHISPER_N_FFT) { - hann = global_cache.hann_window2x; - } else { - WHISPER_ASSERT(false && "Unsupported frame_size"); - return false; - } + WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size"); + const float * hann = global_cache.hann_window; // Calculate the length of padding int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; @@ -3752,30 +3742,6 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads); } -// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) -int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { - if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) { - WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__); - return -1; - } - - return 0; -} - -// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) -int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { - return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads); -} - -// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2 -// TODO - -// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2 -// TODO - -// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2 -// TODO - int whisper_set_mel_with_state( struct whisper_context * ctx, struct whisper_state * state, @@ -4676,7 +4642,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.split_on_word =*/ 
false, /*.max_tokens =*/ 0, - /*.speed_up =*/ false, /*.debug_mode =*/ false, /*.audio_ctx =*/ 0, @@ -5350,15 +5315,9 @@ int whisper_full_with_state( if (n_samples > 0) { // compute log mel spectrogram - if (params.speed_up) { - // TODO: Replace PV with more advanced algorithm + if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__); - return -1; - } else { - if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { - WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__); - return -2; - } + return -2; } } @@ -5395,7 +5354,7 @@ int whisper_full_with_state( // if length of spectrogram is less than 1.0s (100 frames), then return // basically don't process anything that is less than 1.0s // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39 - if (seek_end < seek_start + (params.speed_up ? 50 : 100)) { + if (seek_end < seek_start + 100) { WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10); return 0; } @@ -6107,8 +6066,8 @@ int whisper_full_with_state( const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx)); if (!text.empty()) { - const auto tt0 = params.speed_up ? 2*t0 : t0; - const auto tt1 = params.speed_up ? 2*t1 : t1; + const auto tt0 = t0; + const auto tt1 = t1; if (params.print_realtime) { if (params.print_timestamps) { @@ -6154,8 +6113,8 @@ int whisper_full_with_state( if (!text.empty()) { const auto t1 = seek + seek_delta; - const auto tt0 = params.speed_up ? 2*t0 : t0; - const auto tt1 = params.speed_up ? 
2*t1 : t1; + const auto tt0 = t0; + const auto tt1 = t1; if (params.print_realtime) { if (params.print_timestamps) { diff --git a/whisper.h b/whisper.h index 9c7c58d874b..2b3d5e574cb 100644 --- a/whisper.h +++ b/whisper.h @@ -266,22 +266,6 @@ extern "C" { int n_samples, int n_threads); - // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. - // The resulting spectrogram is stored inside the default state of the provided whisper context. - // Returns 0 on success - WHISPER_API int whisper_pcm_to_mel_phase_vocoder( - struct whisper_context * ctx, - const float * samples, - int n_samples, - int n_threads); - - WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state( - struct whisper_context * ctx, - struct whisper_state * state, - const float * samples, - int n_samples, - int n_threads); - // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context. // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. // n_mel must be 80 @@ -499,7 +483,6 @@ extern "C" { // [EXPERIMENTAL] speed-up techniques // note: these can significantly reduce the quality of the output - bool speed_up; // speed-up the audio by 2x using Phase Vocoder bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel) int audio_ctx; // overwrite the audio context size (0 = use default)