diff --git a/environment.yml b/environment.yml
index 4c973e5..9dd141f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -18,7 +18,8 @@ dependencies:
- sf2utils
- sphinx
- tqdm
- - TTS
+ - "--editable=git+https://github.com/nateshmbhat/pyttsx3.git#egg=pyttsx3"
+ - pyttsx3
- wavio
- wheel
- sounddevice
\ No newline at end of file
diff --git a/examples/AudioCaption.ipynb b/examples/AudioCaption.ipynb
index 575527b..16bb640 100644
--- a/examples/AudioCaption.ipynb
+++ b/examples/AudioCaption.ipynb
@@ -6,7 +6,9 @@
"metadata": {},
"source": [
"### Generate a sonification with an audio caption in `strauss` \n",
- "Import the relevant modules:"
+ "Import the relevant modules:\n",
+ "\n",
+    "***Note***: you will need some form of Python text-to-speech installed (`TTS` or `pyttsx3`) for these examples to work. If neither is available, the examples below will raise an error with more information:"
]
},
{
@@ -27,7 +29,28 @@
"from strauss.generator import Sampler\n",
"import os\n",
"from pathlib import Path\n",
- "%matplotlib inline"
+ "import strauss\n",
+ "%matplotlib inline\n",
+ "\n",
+ "mode = strauss.tts_caption.ttsMode"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "226f3af8-eea8-4f8e-b537-bda602e1418d",
+ "metadata": {},
+ "source": [
+    "What text-to-speech do we have?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ffe715e8-d5aa-487d-a125-0e17a6a01958",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Available text-to-speech (TTS) is: {mode}\")"
]
},
{
@@ -46,7 +69,6 @@
"outputs": [],
"source": [
"# platform agnostic absolute path for samples...\n",
- "import strauss\n",
"strauss_dir = Path(strauss.__file__).parents[2]\n",
"sample_path = Path(strauss_dir, 'data','samples','glockenspiels')\n",
"# setup used in stars appearing example\n",
@@ -74,12 +96,31 @@
"events.apply_mapping_functions(map_lims=maplims)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "ce448cfd-bd92-49d1-9c1d-c3c4d6252383",
+ "metadata": {},
+ "source": [
+    "Now, let's look at the available voices for our TTS engine:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e50a986e-5c51-4d1a-aea5-99f3161cdd9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from strauss.tts_caption import TTS\n",
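+    "# list_models() prints and returns the available pyttsx3 voices;\n",
+    "# with coqui-tts it instead returns the available TTS model names\n",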
+ "voices = TTS().list_models()"
+ ]
+ },
{
"cell_type": "markdown",
"id": "b7d1566f-ff8c-4e21-8ceb-f743394fa4a5",
"metadata": {},
"source": [
- "Generate text-to-speech (TTS) for the caption, using the default choice of voice (`\"Jenny\"` from the `TTS` module)"
+    "Generate text-to-speech (TTS) for the caption, using the default choice of voice (`\"Jenny\"` for the `coqui-tts` module, or the OS default voice for `pyttsx3`)"
]
},
{
@@ -91,7 +132,6 @@
"source": [
"caption_en = 'In the following audio, a glockenspiel is used to represent stars of varying colour.'\n",
"\n",
- "# render at default 48 kHz rate\n",
"soni = Sonification(score, events, generator, system,\n",
" caption=caption_en)\n",
"soni.render()\n",
@@ -107,9 +147,22 @@
"source": [
"caption_en = 'In the following audio, a glockenspiel is used to represent stars of varying colour.'\n",
"\n",
- "soni = Sonification(score, events, generator, system,\n",
- " caption=caption_en,\n",
- " ttsmodel=str(Path('tts_models', 'en', 'ljspeech', 'tacotron2-DDC')))\n",
+ "if mode == 'coqui-tts':\n",
+ " soni = Sonification(score, events, generator, system,\n",
+ " caption=caption_en,\n",
+ " ttsmodel=str(Path('tts_models', 'en', 'ljspeech', 'tacotron2-DDC')))\n",
+ "elif mode == 'pyttsx3':\n",
+ " for v in voices[::-1]:\n",
+ " if v.languages[0][:2] == 'en':\n",
+ " break\n",
+ " print(f\"Selected voice: {v.name}\")\n",
+ " soni = Sonification(score, events, generator, system,\n",
+ " caption=caption_en,\n",
+ " ttsmodel={'voice':v.id,\n",
+    "                                  # we can also set a speech rate for pyttsx3 (in words per minute)...\n",
+ " 'rate': 217})\n",
+ "\n",
"soni.render()\n",
"soni.notebook_display(show_waveform=False)"
]
@@ -131,9 +184,19 @@
"source": [
"caption_de = \"In der folgenden Tonspur wird ein Glockenspiel verwendet um Sterne mit unterschiedlichen Farben zu repräsentieren.\"\n",
"\n",
- "soni = Sonification(score, events, generator, system,\n",
- " caption=caption_de, \n",
- " ttsmodel=str(Path('tts_models', 'de', 'thorsten', 'vits')))\n",
+ "if mode == 'coqui-tts':\n",
+ " soni = Sonification(score, events, generator, system,\n",
+ " caption=caption_de, \n",
+ " ttsmodel=str(Path('tts_models', 'de', 'thorsten', 'vits')))\n",
+ "elif mode == 'pyttsx3':\n",
+ " # find a German-language voice...\n",
+ " for v in voices:\n",
+ " if v.languages[0][:2] == 'de':\n",
+ " break\n",
+ " soni = Sonification(score, events, generator, system,\n",
+ " caption=caption_de,\n",
+ " ttsmodel={'voice':v.id})\n",
+ "\n",
"soni.render()\n",
"soni.notebook_display(show_waveform=False)"
]
@@ -143,7 +206,7 @@
"id": "ff8db018-02e3-48c2-a043-6ba132c1e239",
"metadata": {},
"source": [
- "**Note**: the AI-based `TTS` can behave strangely when using unrecognised characters or terms. Sometimes these will be mispronounced by the TTS, other times they could be skipped entirely. This can be circumvented by writing out the how symbols should be pronounced, or spelling phonetically to improve pronunciation:"
+    "**Note**: the AI-based `TTS` can behave unpredictably when using unrecognised characters or terms. Sometimes these will be mispronounced by the TTS, other times they could be skipped entirely. This can be circumvented by writing out how symbols should be pronounced, or by spelling phonetically to improve pronunciation:"
]
},
{
@@ -155,32 +218,25 @@
"source": [
"symbol_examples_en = 'The Lyman-α resonance is 1216 Å. The Lyman alpha resonance is twelve hundred and sixteen angstroms. '\n",
"\n",
+    "if mode == 'pyttsx3':\n",
+    "    # pyttsx3: select an English voice and set a speech rate...\n",
+    "    for v in voices[::-1]:\n",
+    "        if v.languages[0][:2] == 'en':\n",
+    "            break\n",
+    "    ttsdict = {'voice': v.id, 'rate': 217}\n",
+    "else:\n",
+    "    ttsdict = None\n",
+    "\n",
"soni = Sonification(score, events, generator, system,\n",
-    "                    caption=symbol_examples_en+caption_en)\n",
+    "                    caption=symbol_examples_en, ttsmodel=ttsdict)\n",
+    "\n",
"soni.render()\n",
"soni.notebook_display(show_waveform=0)"
]
},
- {
- "cell_type": "markdown",
- "id": "5a9db75d-6da4-4a9c-92d6-e31caee18e86",
- "metadata": {},
- "source": [
- "Captions can be used to provide context to sonifications, explaining what to listen for.\n",
- "\n",
- "We can list available models for the TTS module (including `Jenny` the default `strauss` voice):"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
- "id": "c216a84c-2fd4-46a0-abc1-a152bc77b639",
+ "id": "f706e822-d989-4b2a-b834-b1565548349d",
"metadata": {},
"outputs": [],
- "source": [
- "from strauss.tts_caption import TTS\n",
- "TTS().list_models()"
- ]
+ "source": []
}
],
"metadata": {
@@ -199,7 +255,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.9"
+ "version": "3.8.20"
}
},
"nbformat": 4,
diff --git a/examples/AudioCaption.py b/examples/AudioCaption.py
index fa389a7..8ee9fd1 100644
--- a/examples/AudioCaption.py
+++ b/examples/AudioCaption.py
@@ -3,6 +3,9 @@
# ### Generate a sonification with an audio caption in `strauss`
# Import the relevant modules:
+#
+# ***Note***: you will need some form of Python text-to-speech installed (`TTS` or `pyttsx3`) for these examples to work. If neither is available, the examples below will raise an error with more information:
+
from strauss.sonification import Sonification
from strauss.sources import Events
@@ -12,13 +15,20 @@
import numpy as np
from strauss.generator import Sampler
import os
-import pprint
from pathlib import Path
+import strauss
+
+mode = strauss.tts_caption.ttsMode
+
+
+# What text-to-speech do we have?
+print(f"Available text-to-speech (TTS) is: {mode}")
+
# Generate a placeholder sonification (a short sequence of glockenspiel notes) that we may want to add a caption to:
+
# platform agnostic absolute path for samples...
-import strauss
strauss_dir = Path(strauss.__file__).parents[2]
sample_path = Path(strauss_dir, 'data','samples','glockenspiels')
@@ -47,26 +57,36 @@
events.apply_mapping_functions(map_lims=maplims)
-# Generate text-to-speech (TTS) for the caption, using the default choice of voice (`"Jenny"` from the `TTS` module)
+# Now, let's look at the available voices for our TTS engine:
+from strauss.tts_caption import TTS
+voices = TTS().list_models()
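+# (list_models() prints and returns the available pyttsx3 voices;
+#  with coqui-tts it instead returns the available TTS model names)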
-caption_en = 'In the following audio, a glockenspiel is used to represent stars of varying colour.'
-print("Example of a caption using the default voice...")
+# Generate text-to-speech (TTS) for the caption, using the default choice of voice (`"Jenny"` for the `coqui-tts` module, or the OS default voice for `pyttsx3`)
+caption_en = 'In the following audio, a glockenspiel is used to represent stars of varying colour.'
-# render at default 48 kHz rate
soni = Sonification(score, events, generator, system,
caption=caption_en)
soni.render()
soni.hear()
-
caption_en = 'In the following audio, a glockenspiel is used to represent stars of varying colour.'
-print("Example of a caption using an alternative voice...")
+if mode == 'coqui-tts':
+ soni = Sonification(score, events, generator, system,
+ caption=caption_en,
+ ttsmodel=str(Path('tts_models', 'en', 'ljspeech', 'tacotron2-DDC')))
+elif mode == 'pyttsx3':
+ for v in voices[::-1]:
+ if v.languages[0][:2] == 'en':
+ break
+ print(f"Selected voice: {v.name}")
+ soni = Sonification(score, events, generator, system,
+ caption=caption_en,
+ ttsmodel={'voice':v.id,
+                                  # we can also set a speech rate for pyttsx3 (in words per minute)...
+ 'rate': 217})
-soni = Sonification(score, events, generator, system,
- caption=caption_en,
- ttsmodel=Path('tts_models', 'en', 'ljspeech', 'tacotron2-DDC'))
soni.render()
soni.hear()
@@ -75,31 +95,34 @@
caption_de = "In der folgenden Tonspur wird ein Glockenspiel verwendet um Sterne mit unterschiedlichen Farben zu repräsentieren."
-print("Example of a caption in a different language (German), selecting a voice supportingh that language ('Thorsten')...")
+if mode == 'coqui-tts':
+ soni = Sonification(score, events, generator, system,
+ caption=caption_de,
+ ttsmodel=str(Path('tts_models', 'de', 'thorsten', 'vits')))
+elif mode == 'pyttsx3':
+ # find a German-language voice...
+ for v in voices:
+ if v.languages[0][:2] == 'de':
+ break
+ soni = Sonification(score, events, generator, system,
+ caption=caption_de,
+ ttsmodel={'voice':v.id})
-soni = Sonification(score, events, generator, system,
- caption=caption_de,
- ttsmodel=Path('tts_models', 'de', 'thorsten', 'vits'))
soni.render()
soni.hear()
-# **Note**: the AI-based `TTS` can behave strangely when using unrecognised characters or terms. Sometimes these will be mispronounced by the TTS, other times they could be skipped entirely. This can be circumvented by writing out the how symbols should be pronounced, or spelling phonetically to improve pronunciation:
+# **Note**: the AI-based `TTS` can behave unpredictably when using unrecognised characters or terms. Sometimes these will be mispronounced by the TTS, other times they could be skipped entirely. This can be circumvented by writing out how symbols should be pronounced, or by spelling phonetically to improve pronunciation:
symbol_examples_en = 'The Lyman-α resonance is 1216 Å. The Lyman alpha resonance is twelve hundred and sixteen angstroms. '
-print("Example of mispronunciation of terms or symbols...")
-
+if mode == 'pyttsx3':
+    # pyttsx3: select an English voice and set a speech rate...
+    for v in voices[::-1]:
+        if v.languages[0][:2] == 'en':
+            break
+    ttsdict = {'voice': v.id, 'rate': 217}
+else:
+    ttsdict = None
+
soni = Sonification(score, events, generator, system,
-                    caption=symbol_examples_en+caption_en)
+                    caption=symbol_examples_en, ttsmodel=ttsdict)
+
soni.render()
soni.hear()
-
-# Captions can be used to provide context to sonifications, explaining what to listen for.
-#
-# We can list available models for the TTS module (including `Jenny` the default `strauss` voice):
-
-print("Print available voice models...")
-from strauss.tts_caption import TTS
-pprint.pprint(TTS().list_models().list_tts_models())
diff --git a/pyproject.toml b/pyproject.toml
index 5ccbad9..61f67fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ requires = [
"sf2utils",
"tqdm",
"wavio",
- "wheel"
+ "wheel",
+ "pyttsx3 @ git+https://github.com/nateshmbhat/pyttsx3.git"
]
build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 557cfd4..f57b6d7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,3 +39,4 @@ where = src
[options.extras_require]
TTS =
TTS
+ pyttsx3
diff --git a/src/strauss/sonification.py b/src/strauss/sonification.py
index ce17480..04c8a03 100644
--- a/src/strauss/sonification.py
+++ b/src/strauss/sonification.py
@@ -16,7 +16,7 @@
from .stream import Stream
from .channels import audio_channels
from .utilities import const_or_evo, nested_dict_idx_reassign, NoSoundDevice
-from .tts_caption import render_caption
+from .tts_caption import render_caption, get_ttsMode
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
@@ -65,13 +65,13 @@ class Sonification:
"""
def __init__(self, score, sources, generator, audio_setup='stereo',
caption=None, samprate=48000,
- ttsmodel=Path('tts_models','en','jenny', 'jenny')):
+ ttsmodel=None):
# sampling rate in Hz
self.samprate = samprate
# tts model name
- self.ttsmodel = str(ttsmodel)
+ self.ttsmodel = ttsmodel
# caption
self.caption = caption
@@ -185,11 +185,25 @@ def render(self, downsamp=1):
# produce mono audio of caption, if one is provided
if str(self.caption or '').strip():
+            ttsMode = get_ttsMode() # determine whether we are using coqui-tts or pyttsx3
+
# use a temporary directory to ensure caption file cleanup
with tempfile.TemporaryDirectory() as cdir:
cpath = Path(cdir, 'caption.wav')
- render_caption(self.caption, self.samprate,
- self.ttsmodel, cpath)
+                if ttsMode == 'coqui-tts':
+                    if self.ttsmodel is None:
+                        # coqui-tts expects a model-name string; default to the Jenny voice
+                        self.ttsmodel = Path('tts_models','en','jenny', 'jenny')
+                    render_caption(self.caption, self.samprate,
+                                   str(self.ttsmodel), cpath)
+                else:
+                    if self.ttsmodel is None:
+                        # pyttsx3 expects a dict of engine properties; default is the OS settings
+                        self.ttsmodel = {}
+                    render_caption(self.caption, self.samprate,
+                                   self.ttsmodel, str(cpath))
rate_in, wavobj = wavfile.read(cpath)
wavobj = np.array(wavobj)
# Set up the Stream objects for TTS
diff --git a/src/strauss/tts_caption.py b/src/strauss/tts_caption.py
index 8288228..d96c827 100644
--- a/src/strauss/tts_caption.py
+++ b/src/strauss/tts_caption.py
@@ -3,51 +3,145 @@
import numpy as np
import strauss.utilities as utils
import re
+import ffmpeg as ff
+import os
+import warnings
+class NoTTSAPI(Exception):
+    # raised when no API key is found for the coqui-TTS module
+ pass
try:
from TTS.api import TTS
-except (OSError, ModuleNotFoundError) as sderr:
- def TTS(*args, **kwargs):
- raise TTSIsNotSupported("strauss has not been installed with text-to-speech support. \n"
- "This is not installed by default, due to some specific module requirements of the TTS module."
- "Reinstalling strauss with 'pip install strauss[TTS]' will give you access to this function")
+    ttsMode = 'coqui-tts'
+ if not os.environ.get("COQUI_STUDIO_TOKEN"):
+ raise NoTTSAPI
+except (OSError, ModuleNotFoundError, NoTTSAPI) as sderr:
+ try:
+ import pyttsx3
+ ttsMode = 'pyttsx3'
+ class TTS:
+            def __init__(self, *args, **kwargs):
+                pass
+ def list_models(self):
+ return getVoices(True)
+        warnings.warn("Default TTS module coqui-TTS not found, using pyttsx3 instead. Note this is\n"
+                      "platform-dependent and still problematic on linux-based systems (which use the espeak engine).")
+ except (OSError, ModuleNotFoundError) as sderr:
+ ttsMode = 'None'
+ def TTS(*args, **kwargs):
+ raise TTSIsNotSupported("strauss has not been installed with text-to-speech support. \n"
+ "This is not installed by default, due to some specific module requirements of the TTS module.\n"
+                                    "Reinstalling strauss with 'pip install strauss[TTS]' will give you access to this function.\n"
+                                    "If you run into issues with the TTS package, you can also install pyttsx3. Currently the most\n"
+                                    "compatible version is not published on PyPI, but you can install it from the git repo with\n"
+ "'pip install git+https://github.com/nateshmbhat/pyttsx3.git'.")
class TTSIsNotSupported(Exception):
pass
+def get_ttsMode():
+    '''Return the detected text-to-speech backend: 'coqui-tts', 'pyttsx3' or 'None'.'''
+    return ttsMode
+
+def getVoices(info=False):
+ '''Get available voices for text-to-speech.
+
+ When info=True, this prints out information
+ for each voice option.
+
+    Args:
+        info (:obj:`bool`): Print out voice information when True,
+            by default False
+
+    Returns:
+        voices (:obj:`list`): List of ``pyttsx3.voice.Voice`` objects
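+
+    Example:
+        A minimal usage sketch (pyttsx3 mode; the voices available and their
+        properties depend on the operating system)::
+
+            from strauss.tts_caption import getVoices
+            voices = getVoices(info=True)  # prints the properties of each voice
+            print(voices[0].id)            # engine-specific voice identifier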
+ '''
+ if ttsMode == 'pyttsx3':
+ engine = pyttsx3.init()
+ voices = engine.getProperty('voices')
+        if info:
+            print('Text-to-speech voice options')
+            for ind in range(len(voices)):
+                voiceProps = vars(voices[ind])
+                print('\nVoice index:', ind)
+                for key in voiceProps.keys():
+                    print('{}: {}'.format(key, voiceProps[key]))
+        return voices
+
def render_caption(caption, samprate, model, caption_path):
'''The render_caption function generates an audio caption from text input
and writes it as a wav file. If the sample rate of the model is not equal
to that passed from sonification.py, it resamples to the correct rate and
- re-writes the file. Text from user input is converted with text-to-speech
- software from Coqui-AI - https://pypi.org/project/TTS/ . You can view
- publicly available voice models with 'TTS.list_models()'
+ re-writes the file.
+
+ If Coqui-AI is installed, text from user input is converted with text-to-
+ speech software from Coqui-AI - https://pypi.org/project/TTS/ .
+ You can view publicly available voice models with 'TTS.list_models()'
+
+ If Coqui-AI is not installed but pyttsx3 (https://pypi.org/project/pyttsx3/)
+ is installed, text from user input is converted offline using pyttsx3.
+
+ Note:
+ STRAUSS checks if Coqui-AI is available. If it is, ``ttsMode`` is set to
+        ``coqui-tts``. If it is unavailable, STRAUSS checks whether pyttsx3 is
+ available. If it is, ``ttsMode`` is set to ``pyttsx3``.
Args:
caption (:obj:`str`): script to be spoken by the TTS voice
samprate (:obj:`int`): samples per second
- model (:obj:`str`): valid name of TTS voice from the underying TTS
- module
- model (:obj:`str`): valid name of TTS voice from the underying TTS
- module
+ model (:obj:`str` for Coqui-AI; :obj:`dict` for pyttsx3): for Coqui-AI:
+ valid name of TTS voice from the underlying TTS module; for pyttsx3:
+        dictionary with keys of 'rate' (integer speech rate in words per minute),
+ 'volume' (float from 0 to 1), and/or 'voice' (the voice 'id' that can
+ be chosen from the list given by the TTS.list_models() function).
caption_path (:obj:`str`): filepath for spoken caption output
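+
+    Example:
+        A minimal sketch of the two model forms (the model name and property
+        values here are illustrative)::
+
+            # coqui-tts: model is a model-name string
+            render_caption('Hello.', 48000, 'tts_models/en/jenny/jenny', 'caption.wav')
+            # pyttsx3: model is a dict of engine properties
+            render_caption('Hello.', 48000, {'rate': 217, 'volume': 0.8}, 'caption.wav')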
'''
- # TODO: do this better with logging. We can filter TTS function output, e.g. alert to downloading models...
- print('Rendering caption (this can take a while if the caption is long, or if the TTS model needs downloading)...')
+    if ttsMode == 'coqui-tts':
+ # TODO: do this better with logging. We can filter TTS function output, e.g. alert to downloading models...
+ print('Rendering caption (this can take a while if the caption is long, or if the TTS model needs downloading)...')
+
+ # capture stdout from the talkative TTS module
+ with utils.Capturing() as output:
+ # Load in the tts model
+ tts = TTS(model, progress_bar=False, gpu=False)
+
+            # render to speech, and write as a wav file
+ tts.tts_to_file(text=caption, file_path=caption_path)
- # capture stdout from the talkative TTS module
- with utils.Capturing() as output:
- # Load in the tts model
- tts = TTS(model, progress_bar=False, gpu=False)
+ elif ttsMode == 'pyttsx3':
- # render to speech, and write as a wav file (allow )
- tts.tts_to_file(text=caption, file_path=caption_path)
+ # Setup voice model for pyttsx3
+ engine = pyttsx3.init() # initialize object
-
+        # apply any engine properties given in the model dict;
+        # unspecified properties keep the engine defaults
+        for key in ['rate', 'volume', 'voice']:
+            if key in model:
+                engine.setProperty(key, model[key])
+
+ engine.save_to_file(caption, caption_path, name='caption')
+        # note: the most compatible pyttsx3 version is the git version, not the current PyPI release
+ engine.runAndWait()
+
+ else:
+ # initialise dummy TTS class to raise error.
+ TTS()
+
# Read the file back in to check the sample rate
- rate_in, wavobj = wavfile.read(caption_path)
-
- #If it doesn't match the required rate, resample and re-write
+ try:
+ # Try to read in directly...
+ rate_in, wavobj = wavfile.read(caption_path)
+    except Exception:
+        # ...but pyttsx3 can produce audio files incompatible
+        # with scipy - convert to standard WAV using ffmpeg
+        cpre = os.path.splitext(caption_path)[0] + '_pre.wav'
+        os.rename(caption_path, cpre)
+        ff.input(cpre).output(caption_path).run(quiet=True)
+ rate_in, wavobj = wavfile.read(caption_path)
+
+ # If it doesn't match the required rate, resample and re-write
if rate_in != samprate:
new_wavobj = utils.resample(rate_in, samprate, wavobj)
wavfile.write(caption_path, samprate, new_wavobj)