Skip to content

Commit

Permalink
whisper : remove speed_up and phase_vocoder* functions (#2198)
Browse files (browse the repository at this point in the history)
* whisper : fix cast warning

* whisper : remove phase_vocoder functions, ref #2195

* whisper : remove speed_up from whisper_full_params, closes #2195
  • Loading branch information
iboB authored May 31, 2024
1 parent b87494b commit af5833e
Show file tree
Hide file tree
Showing 20 changed files with 14 additions and 161 deletions.
9 changes: 0 additions & 9 deletions bindings/go/examples/go-whisper/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
return strings.ToLower(flags.Lookup("out").Value.String())
}

func (flags *Flags) IsSpeedup() bool {
return flags.Lookup("speedup").Value.String() == "true"
}

func (flags *Flags) IsTokens() bool {
return flags.Lookup("tokens").Value.String() == "true"
}
Expand Down Expand Up @@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
context.SetDuration(duration)
}
if flags.IsSpeedup() {
fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
context.SetSpeedup(true)
}
if threads := flags.GetThreads(); threads != 0 {
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
context.SetThreads(threads)
Expand Down Expand Up @@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
flag.Duration("offset", 0, "Time offset")
flag.Duration("duration", 0, "Duration of audio to process")
flag.Uint("threads", 0, "Number of threads to use")
flag.Bool("speedup", false, "Enable speedup")
flag.Uint("max-len", 0, "Maximum segment length in characters")
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
flag.Float64("word-thold", 0, "Maximum segment score")
Expand Down
7 changes: 0 additions & 7 deletions bindings/go/params.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
p.print_timestamps = toBool(v)
}

func (p *Params) SetSpeedup(v bool) {
p.speed_up = toBool(v)
}

// Set language id
func (p *Params) SetLanguage(lang int) error {
if lang == -1 {
Expand Down Expand Up @@ -177,9 +173,6 @@ func (p *Params) String() string {
if p.token_timestamps {
str += " token_timestamps"
}
if p.speed_up {
str += " speed_up"
}

return str + ">"
}
5 changes: 0 additions & 5 deletions bindings/go/pkg/whisper/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
context.params.SetTranslate(v)
}

// Set speedup flag
func (context *context) SetSpeedup(v bool) {
context.params.SetSpeedup(v)
}

func (context *context) SetSplitOnWord(v bool) {
context.params.SetSplitOnWord(v)
}
Expand Down
1 change: 0 additions & 1 deletion bindings/go/pkg/whisper/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ type Context interface {
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSpeedup(bool) // Set speedup flag
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file(String path_model);

/**
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
* Because this function allocates memory for the params, the caller must call either:
Expand Down Expand Up @@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
/** Language id associated with the provided state */
int whisper_full_lang_id_from_state(Pointer state);

/**
* Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
* The resulting spectrogram is stored inside the default state of the provided whisper context.
* @return 0 on success
*/
int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);

int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);

/** Get the start time of the specified segment. */
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,6 @@ public void splitOnWord(boolean enable) {
/** Maximum tokens per segment (0, default = no limit) */
public int max_tokens;

/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
public CBool speed_up;

/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
public void speedUp(boolean enable) {
speed_up = enable ? CBool.TRUE : CBool.FALSE;
}

/** Overwrite the audio context size (0 = use default). */
public int audio_ctx;

Expand Down Expand Up @@ -321,7 +313,7 @@ protected List<String> getFieldOrder() {
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
"no_context", "single_segment", "no_timestamps",
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
Expand Down
8 changes: 0 additions & 8 deletions bindings/ruby/ext/ruby_whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, split_on_word, value)
}
static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
BOOL_PARAMS_GETTER(self, speed_up)
}
static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, speed_up, value)
}
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
Expand Down Expand Up @@ -408,8 +402,6 @@ void Init_whisper() {
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);

Expand Down
7 changes: 0 additions & 7 deletions bindings/ruby/tests/test_whisper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,6 @@ def test_split_on_word
assert !@params.split_on_word
end

def test_speed_up
@params.speed_up = true
assert @params.speed_up
@params.speed_up = false
assert !@params.speed_up
end

def test_whisper
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
Expand Down
3 changes: 0 additions & 3 deletions examples/addon.node/addon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ struct whisper_params {
float entropy_thold = 2.4f;
float logprob_thold = -1.0f;

bool speed_up = false;
bool translate = false;
bool diarize = false;
bool output_txt = false;
Expand Down Expand Up @@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.audio_ctx = params.audio_ctx;

wparams.speed_up = params.speed_up;

wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;

Expand Down
5 changes: 0 additions & 5 deletions examples/command/command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ struct whisper_params {

grammar_parser::parse_state grammar_parsed;

bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
Expand Down Expand Up @@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
Expand Down Expand Up @@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
Expand Down Expand Up @@ -165,7 +162,6 @@ std::string transcribe(
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

wparams.temperature = 0.4f;
wparams.temperature_inc = 1.0f;
Expand Down Expand Up @@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

wparams.prompt_tokens = k_tokens.data();
wparams.prompt_n_tokens = k_tokens.size();
Expand Down
2 changes: 1 addition & 1 deletion examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class wav_writer {
// It is assumed that PCM data is normalized to a range from -1 to 1
bool write_audio(const float * data, size_t length) {
for (size_t i = 0; i < length; ++i) {
const int16_t intSample = data[i] * 32767;
const int16_t intSample = int16_t(data[i] * 32767);
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
dataSize += sizeof(int16_t);
}
Expand Down
5 changes: 0 additions & 5 deletions examples/lsp/lsp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ struct whisper_params {
float vad_thold = 0.6f;
float freq_thold = 100.0f;

bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
Expand Down Expand Up @@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
Expand Down Expand Up @@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
Expand Down Expand Up @@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
Expand Down Expand Up @@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

// TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts
Expand Down
4 changes: 0 additions & 4 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ struct whisper_params {
float temperature = 0.0f;
float temperature_inc = 0.2f;

bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
Expand Down Expand Up @@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
Expand Down Expand Up @@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
Expand Down Expand Up @@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word;
wparams.audio_ctx = params.audio_ctx;

wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;

wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
Expand Down
4 changes: 0 additions & 4 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ struct whisper_params {
float temperature = 0.00f;
float temperature_inc = 0.20f;

bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
Expand Down Expand Up @@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
Expand Down Expand Up @@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
Expand Down Expand Up @@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word;
wparams.audio_ctx = params.audio_ctx;

wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;

wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
Expand Down
Loading

0 comments on commit af5833e

Please sign in to comment.