From af5833e29819810f2d83228228a9a3077e5ccd93 Mon Sep 17 00:00:00 2001
From: Borislav Stanimirov <b.stanimirov@abv.bg>
Date: Fri, 31 May 2024 11:37:29 +0300
Subject: [PATCH] whisper : remove `speed_up` and `phase_vocoder*` functions
 (#2198)

* whisper : fix cast warning

* whisper : remove phase_vocoder functions, ref #2195

* whisper : remove speed_up from whisper_full_params, closes #2195
---
 bindings/go/examples/go-whisper/flags.go      |  9 ---
 bindings/go/params.go                         |  7 ---
 bindings/go/pkg/whisper/context.go            |  5 --
 bindings/go/pkg/whisper/interface.go          |  1 -
 .../whispercpp/WhisperCppJnaLibrary.java      | 10 +--
 .../whispercpp/params/WhisperFullParams.java  | 10 +--
 bindings/ruby/ext/ruby_whisper.cpp            |  8 ---
 bindings/ruby/tests/test_whisper.rb           |  7 ---
 examples/addon.node/addon.cpp                 |  3 -
 examples/command/command.cpp                  |  5 --
 examples/common.h                             |  2 +-
 examples/lsp/lsp.cpp                          |  5 --
 examples/main/main.cpp                        |  4 --
 examples/server/server.cpp                    |  4 --
 examples/stream/stream.cpp                    |  4 --
 examples/talk-llama/talk-llama.cpp            |  4 --
 examples/talk/talk.cpp                        |  4 --
 examples/wchess/wchess.cmd/wchess.cmd.cpp     |  3 -
 whisper.cpp                                   | 63 ++++---------------
 whisper.h                                     | 17 -----
 20 files changed, 14 insertions(+), 161 deletions(-)

diff --git a/bindings/go/examples/go-whisper/flags.go b/bindings/go/examples/go-whisper/flags.go
index ea204455c80..766c92f1827 100644
--- a/bindings/go/examples/go-whisper/flags.go
+++ b/bindings/go/examples/go-whisper/flags.go
@@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
 	return strings.ToLower(flags.Lookup("out").Value.String())
 }
 
-func (flags *Flags) IsSpeedup() bool {
-	return flags.Lookup("speedup").Value.String() == "true"
-}
-
 func (flags *Flags) IsTokens() bool {
 	return flags.Lookup("tokens").Value.String() == "true"
 }
@@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
 		fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
 		context.SetDuration(duration)
 	}
-	if flags.IsSpeedup() {
-		fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
-		context.SetSpeedup(true)
-	}
 	if threads := flags.GetThreads(); threads != 0 {
 		fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
 		context.SetThreads(threads)
@@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
 	flag.Duration("offset", 0, "Time offset")
 	flag.Duration("duration", 0, "Duration of audio to process")
 	flag.Uint("threads", 0, "Number of threads to use")
-	flag.Bool("speedup", false, "Enable speedup")
 	flag.Uint("max-len", 0, "Maximum segment length in characters")
 	flag.Uint("max-tokens", 0, "Maximum tokens per segment")
 	flag.Float64("word-thold", 0, "Maximum segment score")
diff --git a/bindings/go/params.go b/bindings/go/params.go
index 5931bb0b199..4b4da032d62 100644
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
 	p.print_timestamps = toBool(v)
 }
 
-func (p *Params) SetSpeedup(v bool) {
-	p.speed_up = toBool(v)
-}
-
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
 	if lang == -1 {
@@ -177,9 +173,6 @@ func (p *Params) String() string {
 	if p.token_timestamps {
 		str += " token_timestamps"
 	}
-	if p.speed_up {
-		str += " speed_up"
-	}
 
 	return str + ">"
 }
diff --git a/bindings/go/pkg/whisper/context.go b/bindings/go/pkg/whisper/context.go
index 0863ef6bb16..ead92648f3e 100644
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
 	context.params.SetTranslate(v)
 }
 
-// Set speedup flag
-func (context *context) SetSpeedup(v bool) {
-	context.params.SetSpeedup(v)
-}
-
 func (context *context) SetSplitOnWord(v bool) {
 	context.params.SetSplitOnWord(v)
 }
diff --git a/bindings/go/pkg/whisper/interface.go b/bindings/go/pkg/whisper/interface.go
index 4339e16f847..b430e7ce853 100644
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@@ -41,7 +41,6 @@ type Context interface {
 	SetOffset(time.Duration)        // Set offset
 	SetDuration(time.Duration)      // Set duration
 	SetThreads(uint)                // Set number of threads to use
-	SetSpeedup(bool)                // Set speedup flag
 	SetSplitOnWord(bool)            // Set split on word flag
 	SetTokenThreshold(float32)      // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
index 56a37380136..1a73cee1181 100644
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
      * @return Whisper context on success, null on failure
      */
     Pointer whisper_init_from_file(String path_model);
-    
+
     /**
      * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
      * Because this function allocates memory for the params, the caller must call either:
@@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
     /** Language id associated with the provided state */
     int whisper_full_lang_id_from_state(Pointer state);
 
-    /**
-     * Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-     * The resulting spectrogram is stored inside the default state of the provided whisper context.
-     * @return 0 on success
-     */
-    int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
-
-    int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
 
     /** Get the start time of the specified segment. */
     long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
index 60d8334b935..90d8c15767c 100644
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -129,14 +129,6 @@ public void splitOnWord(boolean enable) {
     /** Maximum tokens per segment (0, default = no limit) */
     public int max_tokens;
 
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
-    public CBool speed_up;
-
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
-    public void speedUp(boolean enable) {
-        speed_up = enable ? CBool.TRUE : CBool.FALSE;
-    }
-
     /** Overwrite the audio context size (0 = use default). */
     public int audio_ctx;
 
@@ -321,7 +313,7 @@ protected List<String> getFieldOrder() {
         return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
                 "no_context", "single_segment", "no_timestamps",
                 "print_special", "print_progress", "print_realtime", "print_timestamps",  "token_timestamps",
-                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
+                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
                 "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
                 "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
                 "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
diff --git a/bindings/ruby/ext/ruby_whisper.cpp b/bindings/ruby/ext/ruby_whisper.cpp
index 86af9391e2c..9d9334539b8 100644
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
 static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
   BOOL_PARAMS_SETTER(self, split_on_word, value)
 }
-static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
-  BOOL_PARAMS_GETTER(self, speed_up)
-}
-static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, speed_up, value)
-}
 static VALUE ruby_whisper_params_get_diarize(VALUE self) {
   ruby_whisper_params *rwp;
   Data_Get_Struct(self, ruby_whisper_params, rwp);
@@ -408,8 +402,6 @@ void Init_whisper() {
   rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
   rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
   rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
-  rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
-  rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
   rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
   rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
 
diff --git a/bindings/ruby/tests/test_whisper.rb b/bindings/ruby/tests/test_whisper.rb
index fa6a3e2d4e8..3700671bce6 100644
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@@ -117,13 +117,6 @@ def test_split_on_word
     assert !@params.split_on_word
   end
 
-  def test_speed_up
-    @params.speed_up = true
-    assert @params.speed_up
-    @params.speed_up = false
-    assert !@params.speed_up
-  end
-
   def test_whisper
     @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
     params  = Whisper::Params.new
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp
index 53bf1abb5a3..4ada6ca5084 100644
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@@ -25,7 +25,6 @@ struct whisper_params {
     float entropy_thold = 2.4f;
     float logprob_thold = -1.0f;
 
-    bool speed_up       = false;
     bool translate      = false;
     bool diarize        = false;
     bool output_txt     = false;
@@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
             wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
             wparams.audio_ctx        = params.audio_ctx;
 
-            wparams.speed_up         = params.speed_up;
-
             wparams.greedy.best_of        = params.best_of;
             wparams.beam_search.beam_size = params.beam_size;
 
diff --git a/examples/command/command.cpp b/examples/command/command.cpp
index cd6cc023994..84424d4331b 100644
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@@ -38,7 +38,6 @@ struct whisper_params {
 
     grammar_parser::parse_state grammar_parsed;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool print_special = false;
     bool print_energy  = false;
@@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -165,7 +162,6 @@ std::string transcribe(
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx = params.audio_ctx;
-    wparams.speed_up  = params.speed_up;
 
     wparams.temperature     = 0.4f;
     wparams.temperature_inc = 1.0f;
@@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
             wparams.n_threads        = params.n_threads;
 
             wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;
 
             wparams.prompt_tokens    = k_tokens.data();
             wparams.prompt_n_tokens  = k_tokens.size();
diff --git a/examples/common.h b/examples/common.h
index 2ed91ca9aa8..de895858ab0 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -185,7 +185,7 @@ class wav_writer {
     // It is assumed that PCM data is normalized to a range from -1 to 1
     bool write_audio(const float * data, size_t length) {
         for (size_t i = 0; i < length; ++i) {
-            const int16_t intSample = data[i] * 32767;
+            const int16_t intSample = int16_t(data[i] * 32767);
             file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
             dataSize += sizeof(int16_t);
         }
diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp
index 3df54266a25..8cca87151bf 100644
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
     float vad_thold    = 0.6f;
     float freq_thold   = 100.0f;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool print_special = false;
     bool print_energy  = false;
@@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
     wparams.suppress_non_speech_tokens = true;
     // run the transformer and a single decoding pass
     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
 
     // TODO: Do some time testing. Does an overly long prompt slow down processing?
     // Set up command sets/precompute prompts
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 45eb17fe7f3..bb9b7b79ce5 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -47,7 +47,6 @@ struct whisper_params {
     float temperature     = 0.0f;
     float temperature_inc = 0.2f;
 
-    bool speed_up        = false;
     bool debug_mode      = false;
     bool translate       = false;
     bool detect_language = false;
@@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
         else if (arg == "-tp"   || arg == "--temperature")     { params.temperature     = std::stof(argv[++i]); }
         else if (arg == "-tpi"  || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
         else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
         else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
         else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
     fprintf(stderr, "  -tp,       --temperature N     [%-7.2f] The sampling temperature, between 0 and 1\n",    params.temperature);
     fprintf(stderr, "  -tpi,      --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
     fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
     fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
             wparams.split_on_word    = params.split_on_word;
             wparams.audio_ctx        = params.audio_ctx;
 
-            wparams.speed_up         = params.speed_up;
             wparams.debug_mode       = params.debug_mode;
 
             wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2efa4c7a020..10aae9c04d3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,6 @@ struct whisper_params {
     float temperature     =  0.00f;
     float temperature_inc =  0.20f;
 
-    bool speed_up        = false;
     bool debug_mode      = false;
     bool translate       = false;
     bool detect_language = false;
@@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
     fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
     fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
     fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
     fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
         else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
         else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
         else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
         else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
         else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
             wparams.split_on_word    = params.split_on_word;
             wparams.audio_ctx        = params.audio_ctx;
 
-            wparams.speed_up         = params.speed_up;
             wparams.debug_mode       = params.debug_mode;
 
             wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 60c1b0894e4..50797e96daa 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -27,7 +27,6 @@ struct whisper_params {
     float vad_thold    = 0.6f;
     float freq_thold   = 100.0f;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool no_fallback   = false;
     bool print_special = false;
@@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"   || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth"  || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth"  || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"   || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
         else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
@@ -100,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
     fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",           params.vad_thold);
     fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                   params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
@@ -314,7 +311,6 @@ int main(int argc, char ** argv) {
             wparams.n_threads        = params.n_threads;
 
             wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;
 
             wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
 
diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp
index 4aab62b9a6f..b15be0b2789 100644
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@@ -59,7 +59,6 @@ struct whisper_params {
     float vad_thold  = 0.6f;
     float freq_thold = 100.0f;
 
-    bool speed_up       = false;
     bool translate      = false;
     bool print_special  = false;
     bool print_energy   = false;
@@ -100,7 +99,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")       { params.speed_up       = true; }
         else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
         else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
         else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
@@ -149,7 +147,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
     fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -205,7 +202,6 @@ std::string transcribe(
     wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();
 
     wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
 
     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
         return "";
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp
index 3e34e5724ff..b34fad6c2bb 100644
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
     float vad_thold    = 0.6f;
     float freq_thold   = 100.0f;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool print_special = false;
     bool print_energy  = false;
@@ -60,7 +59,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -132,7 +129,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
 
     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
         return "";
diff --git a/examples/wchess/wchess.cmd/wchess.cmd.cpp b/examples/wchess/wchess.cmd/wchess.cmd.cpp
index 09e53f13172..4d049976315 100644
--- a/examples/wchess/wchess.cmd/wchess.cmd.cpp
+++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
 
     float grammar_penalty = 100.0f;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool print_special = false;
     bool print_energy  = false;
@@ -57,7 +56,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -89,7 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
diff --git a/whisper.cpp b/whisper.cpp
index a22da8896bb..dbb235e9f43 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2868,13 +2868,10 @@ struct whisper_global_cache {
     // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
     // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
     float hann_window[WHISPER_N_FFT];
-    float hann_window2x[WHISPER_N_FFT * 2];
 
     whisper_global_cache() {
         fill_sin_cos_table();
-#define FILL_HANN_WINDOW(arr) fill_hann_window(sizeof(arr) / sizeof(arr[0]), true, arr)
-        FILL_HANN_WINDOW(hann_window);
-        FILL_HANN_WINDOW(hann_window2x);
+        fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
     }
 
     void fill_sin_cos_table() {
@@ -2885,7 +2882,7 @@ struct whisper_global_cache {
         }
     }
 
-    void fill_hann_window(int length, bool periodic, float* output) {
+    void fill_hann_window(int length, bool periodic, float * output) {
         int offset = -1;
         if (periodic) {
             offset = 0;
@@ -3061,15 +3058,8 @@ static bool log_mel_spectrogram(
     const int64_t t_start_us = ggml_time_us();
 
     // Hann window
-    const float * hann = nullptr;
-    if (frame_size == WHISPER_N_FFT) {
-        hann = global_cache.hann_window;
-    } else if (frame_size == 2 * WHISPER_N_FFT) {
-        hann = global_cache.hann_window2x;
-    } else {
-        WHISPER_ASSERT(false && "Unsupported frame_size");
-        return false;
-    }
+    WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
+    const float * hann = global_cache.hann_window;
 
     // Calculate the length of padding
     int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
@@ -3752,30 +3742,6 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
     return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
 }
 
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
-        WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
-// TODO
-
 int whisper_set_mel_with_state(
         struct whisper_context * ctx,
           struct whisper_state * state,
@@ -4676,7 +4642,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.split_on_word     =*/ false,
         /*.max_tokens        =*/ 0,
 
-        /*.speed_up          =*/ false,
         /*.debug_mode        =*/ false,
         /*.audio_ctx         =*/ 0,
 
@@ -5350,15 +5315,9 @@ int whisper_full_with_state(
 
     if (n_samples > 0) {
         // compute log mel spectrogram
-        if (params.speed_up) {
-            // TODO: Replace PV with more advanced algorithm
+        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
             WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-            return -1;
-        } else {
-            if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-                WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-                return -2;
-            }
+            return -2;
         }
     }
 
@@ -5395,7 +5354,7 @@ int whisper_full_with_state(
     // if length of spectrogram is less than 1.0s (100 frames), then return
     // basically don't process anything that is less than 1.0s
     // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
-    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
+    if (seek_end < seek_start + 100) {
         WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
         return 0;
     }
@@ -6107,8 +6066,8 @@ int whisper_full_with_state(
                         const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
 
                         if (!text.empty()) {
-                            const auto tt0 = params.speed_up ? 2*t0 : t0;
-                            const auto tt1 = params.speed_up ? 2*t1 : t1;
+                            const auto tt0 = t0;
+                            const auto tt1 = t1;
 
                             if (params.print_realtime) {
                                 if (params.print_timestamps) {
@@ -6154,8 +6113,8 @@ int whisper_full_with_state(
                 if (!text.empty()) {
                     const auto t1 = seek + seek_delta;
 
-                    const auto tt0 = params.speed_up ? 2*t0 : t0;
-                    const auto tt1 = params.speed_up ? 2*t1 : t1;
+                    const auto tt0 = t0;
+                    const auto tt1 = t1;
 
                     if (params.print_realtime) {
                         if (params.print_timestamps) {
diff --git a/whisper.h b/whisper.h
index 9c7c58d874b..2b3d5e574cb 100644
--- a/whisper.h
+++ b/whisper.h
@@ -266,22 +266,6 @@ extern "C" {
                                int   n_samples,
                                int   n_threads);
 
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
     // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
     // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
     // n_mel must be 80
@@ -499,7 +483,6 @@ extern "C" {
 
         // [EXPERIMENTAL] speed-up techniques
         // note: these can significantly reduce the quality of the output
-        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
         bool debug_mode;        // enable debug_mode provides extra info (eg. Dump log_mel)
         int  audio_ctx;         // overwrite the audio context size (0 = use default)