Skip to content

Commit

Permalink
Merge branch 'master' into fix-decoding
Browse files Browse the repository at this point in the history
  • Loading branch information
bobqianic authored Jan 31, 2024
2 parents cda677e + 7a74e92 commit 4b079bf
Show file tree
Hide file tree
Showing 30 changed files with 3,715 additions and 775 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,8 @@ if (WHISPER_ALL_WARNINGS)
endif()

if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
# TODO: temporary disabled until we figure out ggml-metal.m
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
endif()

Expand Down Expand Up @@ -505,6 +506,7 @@ else()
endif()

if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC
${CMAKE_DL_LIBS}
)
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
else ifeq ($(UNAME_S),Linux)
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
else ifneq (,$(filter MINGW32_NT% MINGW64_NT% MSYS_NT%,$(UNAME_S)))
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
CPUINFO_CMD := grep Features /var/run/dmesg.boot
Expand Down
140 changes: 73 additions & 67 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions bindings/javascript/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ make publish-npm

## Sample run

```java
```text
$ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
whisper_model_load: loading model from 'whisper.bin'
Expand All @@ -63,7 +63,7 @@ whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model size = 140.54 MB
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...
Expand Down
2 changes: 2 additions & 0 deletions examples/common-ggml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
case GGML_FTYPE_MOSTLY_IQ2_XXS:
case GGML_FTYPE_MOSTLY_IQ2_XS:
case GGML_FTYPE_MOSTLY_IQ3_XXS:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
Expand Down Expand Up @@ -195,6 +196,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
Expand Down
18 changes: 17 additions & 1 deletion examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(

}


namespace utf_8 {
bool is_valid(const std::string &str) {
uint64_t count = 0; // Count of bytes in the current UTF-8 character
Expand Down Expand Up @@ -665,6 +666,21 @@ namespace utf_8 {
}
}

bool is_wav_buffer(const std::string buf) {
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
return false;
}

uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
if (chunk_size + 8 != buf.size()) {
return false;
}

return true;
}

bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
Expand All @@ -689,7 +705,7 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (fname.size() > 256 || fname.size() > 40 && fname.substr(0, 4) == "RIFF" && fname.substr(8, 4) == "WAVE") {
else if (is_wav_buffer(fname)) {
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
return false;
Expand Down
3 changes: 3 additions & 0 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ namespace utf_8{
// Audio utils
//

// Check if a buffer is a WAV audio file
bool is_wav_buffer(const std::string buf);

// Read WAV audio file and store the PCM data into pcmf32
// fname can be a buffer of WAV data instead of a filename
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
Expand Down
89 changes: 87 additions & 2 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
{"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}});

std::string const default_content = "<html>hello</html>";
std::string const default_content = R"(
<html>
<head>
<title>Whisper.cpp Server</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<style>
body {
font-family: sans-serif;
}
form {
display: flex;
flex-direction: column;
align-items: flex-start;
}
label {
margin-bottom: 0.5rem;
}
input, select {
margin-bottom: 1rem;
}
button {
margin-top: 1rem;
}
</style>
</head>
<body>
<h1>Whisper.cpp Server</h1>
<h2>/inference</h2>
<pre>
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
-H "Content-Type: multipart/form-data" \
-F file="@&lt;file-path&gt;" \
-F temperature="0.0" \
-F temperature_inc="0.2" \
-F response_format="json"
</pre>
<h2>/load</h2>
<pre>
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
-H "Content-Type: multipart/form-data" \
-F model="&lt;path-to-model-file&gt;"
</pre>
<div>
<h2>Try it out</h2>
<form action="/inference" method="POST" enctype="multipart/form-data">
<label for="file">Choose an audio file:</label>
<input type="file" id="file" name="file" accept="audio/*" required><br>
<label for="temperature">Temperature:</label>
<input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
<label for="response_format">Response Format:</label>
<select id="response_format" name="response_format">
<option value="verbose_json">Verbose JSON</option>
<option value="json">JSON</option>
<option value="text">Text</option>
<option value="srt">SRT</option>
<option value="vtt">VTT</option>
</select><br>
<button type="submit">Submit</button>
</form>
</div>
</body>
</html>
)";

// store default params so we can reset after each inference request
whisper_params default_params = params;
Expand Down Expand Up @@ -787,7 +856,13 @@ int main(int argc, char ** argv) {
} else if (params.response_format == vjson_format) {
/* try to match openai/whisper's Python format */
std::string results = output_str(ctx, params, pcmf32s);
json jres = json{{"text", results}};
json jres = json{
{"task", params.translate ? "translate" : "transcribe"},
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
{"text", results},
{"segments", json::array()}
};
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i)
{
Expand All @@ -801,6 +876,7 @@ int main(int argc, char ** argv) {
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
}

float total_logprob = 0;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
Expand All @@ -815,8 +891,17 @@ int main(int argc, char ** argv) {
word["end"] = token.t1 * 0.01;
}
word["probability"] = token.p;
total_logprob += token.plog;
segment["words"].push_back(word);
}

segment["temperature"] = params.temperature;
segment["avg_logprob"] = total_logprob / n_tokens;

// TODO compression_ratio and no_speech_prob are not implemented yet
// segment["compression_ratio"] = 0;
// segment["no_speech_prob"] = 0;

jres["segments"].push_back(segment);
}
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
Expand Down
8 changes: 4 additions & 4 deletions examples/stream/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ This is a naive example of performing real-time inference on audio from your mic
The `stream` tool samples the audio every half a second and runs the transcription continously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

```java
```bash
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```

Expand All @@ -14,7 +14,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a

Setting the `--step` argument to `0` enables the sliding window mode:

```java
```bash
./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
```

Expand All @@ -39,8 +39,8 @@ brew install sdl2
make stream
```

Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
`examples/steam` means your compiler cannot find them and it gives an error it cannot find the file.

```bash
Expand Down
Loading

0 comments on commit 4b079bf

Please sign in to comment.