-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
36,087 additions
and
86 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Build script for the `main` example executable, compiled from main.cpp. | ||
# TARGET holds the executable's name and is reused by every command below. | ||
set(TARGET main) | ||
add_executable(${TARGET} main.cpp) | ||
|
||
# NOTE(review): DefaultTargetOptions is a project module not shown here; it presumably reads TARGET to apply shared options - confirm. | ||
include(DefaultTargetOptions) | ||
|
||
# Link privately against `common` and `whisper` (both defined elsewhere in the project) plus the contents of CMAKE_THREAD_LIBS_INIT. | ||
# CMAKE_THREAD_LIBS_INIT is filled in by find_package(Threads), which is not called in this file - it must run earlier in the project. | ||
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# main | ||
|
||
This is the main example demonstrating most of the functionality of the Whisper model. | ||
It can be used as a reference for using the `whisper.cpp` library in other projects. | ||
|
||
``` | ||
./main -h | ||
usage: ./main [options] file0.wav file1.wav ... | ||
options: | ||
-h, --help [default] show this help message and exit | ||
-t N, --threads N [4 ] number of threads to use during computation | ||
-p N, --processors N [1 ] number of processors to use during computation | ||
-ot N, --offset-t N [0 ] time offset in milliseconds | ||
-on N, --offset-n N [0 ] segment index offset | ||
-d N, --duration N [0 ] duration of audio to process in milliseconds | ||
-mc N, --max-context N [-1 ] maximum number of text context tokens to store | ||
-ml N, --max-len N [0 ] maximum segment length in characters | ||
-sow, --split-on-word [false ] split on word rather than on token | ||
-bo N, --best-of N [5 ] number of best candidates to keep | ||
-bs N, --beam-size N [5 ] beam size for beam search | ||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold | ||
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail | ||
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail | ||
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel) | ||
-tr, --translate [false ] translate from source language to english | ||
-di, --diarize [false ] stereo audio diarization | ||
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model) | ||
-nf, --no-fallback [false ] do not use temperature fallback while decoding | ||
-otxt, --output-txt [false ] output result in a text file | ||
-ovtt, --output-vtt [false ] output result in a vtt file | ||
-osrt, --output-srt [false ] output result in a srt file | ||
-olrc, --output-lrc [false ] output result in a lrc file | ||
-owts, --output-words [false ] output script for generating karaoke video | ||
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video | ||
-ocsv, --output-csv [false ] output result in a CSV file | ||
-oj, --output-json [false ] output result in a JSON file | ||
-ojf, --output-json-full [false ] include more information in the JSON file | ||
-of FNAME, --output-file FNAME [ ] output file path (without file extension) | ||
-ps, --print-special [false ] print special tokens | ||
-pc, --print-colors [false ] print colors | ||
-pp, --print-progress [false ] print progress | ||
-nt, --no-timestamps [false ] do not print timestamps | ||
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect) | ||
-dl, --detect-language [false ] exit after automatically detecting language | ||
--prompt PROMPT [ ] initial prompt | ||
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path | ||
-f FNAME, --file FNAME [ ] input WAV file path | ||
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference | ||
-ls, --log-score [false ] log best decoder scores of tokens | ||
-ng, --no-gpu [false ] disable GPU | ||
``` |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Build script for the `server` example executable. | ||
# TARGET holds the executable's name and is reused by every command below. | ||
set(TARGET server) | ||
# httplib.h and json.hpp are listed as sources next to server.cpp, so they are part of this target. | ||
add_executable(${TARGET} server.cpp httplib.h json.hpp) | ||
|
||
# NOTE(review): DefaultTargetOptions is a project module not shown here; it presumably reads TARGET to apply shared options - confirm. | ||
include(DefaultTargetOptions) | ||
|
||
# Link privately against `common` and `whisper` (both defined elsewhere in the project) plus the contents of CMAKE_THREAD_LIBS_INIT. | ||
# CMAKE_THREAD_LIBS_INIT is filled in by find_package(Threads), which is not called in this file - it must run earlier in the project. | ||
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) | ||
|
||
# Check if the compiler is MinGW | ||
if(MINGW) | ||
# Link the necessary libraries for SSL and Winsock | ||
# The libraries are passed as raw -l linker flags, so they are resolved by the linker's search path rather than by CMake. | ||
target_link_libraries(${TARGET} PRIVATE -lcrypt32 -lssl -lcrypto -lws2_32) | ||
endif() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# whisper.cpp http server | ||
|
||
A simple HTTP server. WAV files are passed to the inference model via HTTP requests. | ||
|
||
https://github.com/ggerganov/whisper.cpp/assets/1991296/e983ee53-8741-4eb5-9048-afe5e4594b8f | ||
|
||
## Usage | ||
|
||
``` | ||
./server -h | ||
usage: ./bin/server [options] | ||
options: | ||
-h, --help [default] show this help message and exit | ||
-t N, --threads N [4 ] number of threads to use during computation | ||
-p N, --processors N [1 ] number of processors to use during computation | ||
-ot N, --offset-t N [0 ] time offset in milliseconds | ||
-on N, --offset-n N [0 ] segment index offset | ||
-d N, --duration N [0 ] duration of audio to process in milliseconds | ||
-mc N, --max-context N [-1 ] maximum number of text context tokens to store | ||
-ml N, --max-len N [0 ] maximum segment length in characters | ||
-sow, --split-on-word [false ] split on word rather than on token | ||
-bo N, --best-of N [2 ] number of best candidates to keep | ||
-bs N, --beam-size N [-1 ] beam size for beam search | ||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold | ||
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail | ||
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail | ||
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel) | ||
-tr, --translate [false ] translate from source language to english | ||
-di, --diarize [false ] stereo audio diarization | ||
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model) | ||
-nf, --no-fallback [false ] do not use temperature fallback while decoding | ||
-ps, --print-special [false ] print special tokens | ||
-pc, --print-colors [false ] print colors | ||
-pr, --print-realtime [false ] print output in realtime | ||
-pp, --print-progress [false ] print progress | ||
-nt, --no-timestamps [false ] do not print timestamps | ||
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect) | ||
-dl, --detect-language [false ] exit after automatically detecting language | ||
--prompt PROMPT [ ] initial prompt | ||
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path | ||
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference | ||
--host HOST, [127.0.0.1] Hostname/ip-adress for the server | ||
--port PORT, [8080 ] Port number for the server | ||
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server | ||
``` | ||
|
||
> [!WARNING] | ||
> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.** | ||
## Request examples | ||
|
||
**/inference** | ||
``` | ||
curl 127.0.0.1:8080/inference \ | ||
-H "Content-Type: multipart/form-data" \ | ||
-F file="@<file-path>" \ | ||
-F temperature="0.0" \ | ||
-F temperature_inc="0.2" \ | ||
-F response_format="json" | ||
``` | ||
|
||
**/load** | ||
``` | ||
curl 127.0.0.1:8080/load \ | ||
-H "Content-Type: multipart/form-data" \ | ||
-F model="<path-to-model-file>" | ||
``` |
Oops, something went wrong.