add streaming zipformer CTC models doc (#524)

k2-fsa · Dec 21, 2023 · 009ade8 · 009ade8
1 parent 27cebee
commit 009ade8
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 0 deletions.
diff --git a/docs/source/onnx/pretrained_models/index.rst b/docs/source/onnx/pretrained_models/index.rst
@@ -17,6 +17,7 @@ available pre-trained models.
 
    online-transducer/index
    online-paraformer/index
+   online-ctc/index
    offline-transducer/index
    offline-paraformer/index
    offline-ctc/index

diff --git a/...-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt b/...-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt
@@ -0,0 +1,8 @@
+/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav 
+
+OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx"), tokens="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search")
+./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav
+Elapsed seconds: 0.44, Real time factor (RTF): 0.078
+ 对我做了介绍那么我想说的是大家如果对我的研究感兴趣
+{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.00, 0.52, 0.76, 0.84, 1.04, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.84], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]}
+
diff --git a/...nline-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt b/...nline-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt
@@ -0,0 +1,8 @@
+/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav 
+
+OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx"), tokens="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search")
+./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav
+Elapsed seconds: 0.66, Real time factor (RTF): 0.12
+ 对我做了介绍那么我想说的是大家如果对我的研究感兴趣
+{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.00, 0.52, 0.76, 0.84, 1.08, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.80], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]}
+
diff --git a/docs/source/onnx/pretrained_models/online-ctc/index.rst b/docs/source/onnx/pretrained_models/online-ctc/index.rst
@@ -0,0 +1,11 @@
+.. _onnx_online_ctc_models:
+
+Online CTC models
+=================
+
+This section lists available online CTC models.
+
+.. toctree::
+   :maxdepth: 5
+
+   zipformer-ctc-models
diff --git a/docs/source/onnx/pretrained_models/online-ctc/zipformer-ctc-models.rst b/docs/source/onnx/pretrained_models/online-ctc/zipformer-ctc-models.rst
@@ -0,0 +1,135 @@
+.. _sherpa_onnx_zipformer_ctc_models:
+
+Zipformer-CTC-based Models
+==========================
+
+.. hint::
+
+   Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_
+   before you read this section.
+
+sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 (Chinese)
+----------------------------------------------------------------------
+
+Training code for this model can be found at `<https://github.com/k2-fsa/icefall/pull/1369>`_.
+It supports only Chinese.
+
+Please refer to `<https://github.com/k2-fsa/icefall/tree/master/egs/multi_zh-hans/ASR#included-training-sets>`_
+for the detailed information about the training data. In total, there are 14k hours of training data.
+
+In the following, we describe how to download it and use it with `sherpa-onnx`_.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+  cd /path/to/sherpa-onnx
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
+  tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
+  rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
+  ls -lh sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
+
+The output is given below:
+
+.. code-block::
+
+  $ ls -lh sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
+  total 654136
+  -rw-r--r--@ 1 fangjun  staff    28B Dec 13 16:19 README.md
+  -rw-r--r--@ 1 fangjun  staff   258K Dec 13 16:19 bpe.model
+  -rw-r--r--@ 1 fangjun  staff    68M Dec 13 16:19 ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx
+  -rw-r--r--@ 1 fangjun  staff   252M Dec 13 16:19 ctc-epoch-20-avg-1-chunk-16-left-128.onnx
+  drwxr-xr-x@ 8 fangjun  staff   256B Dec 13 16:19 test_wavs
+  -rw-r--r--@ 1 fangjun  staff    18K Dec 13 16:19 tokens.txt
+
+Decode a single wave file
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. hint::
+
+   It supports decoding only wave files of a single channel with 16-bit
+   encoded samples, while the sampling rate does not need to be 16 kHz.
+
+fp32
+^^^^
+
+The following code shows how to use ``fp32`` models to decode a wave file:
+
+.. code-block:: bash
+
+  cd /path/to/sherpa-onnx
+
+  ./build/bin/sherpa-onnx \
+    --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \
+    ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+      .. code-block:: bash
+
+          CHCP 65001
+
+   in your commandline.
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt
+
+int8
+^^^^
+
+The following code shows how to use ``int8`` models to decode a wave file:
+
+.. code-block:: bash
+
+  cd /path/to/sherpa-onnx
+
+  ./build/bin/sherpa-onnx \
+    --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+    --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \
+    ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+      .. code-block:: bash
+
+          CHCP 65001
+
+   in your commandline.
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+  cd /path/to/sherpa-onnx
+
+  ./build/bin/sherpa-onnx-microphone \
+    --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-onnx-microphone`` does not work for you.