james-trayford · soleyhyman · Oct 29, 2024 · Nov 3, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/src/strauss/sonification.py b/src/strauss/sonification.py
@@ -16,7 +16,7 @@
 from .stream import Stream
 from .channels import audio_channels
 from .utilities import const_or_evo, nested_dict_idx_reassign, NoSoundDevice
-from .tts_caption import render_caption
+from .tts_caption import render_caption, get_ttsMode
 import numpy as np
 import matplotlib.pyplot as plt
 from tqdm import tqdm
@@ -65,13 +65,13 @@ class Sonification:
     """
     def __init__(self, score, sources, generator, audio_setup='stereo',
                  caption=None, samprate=48000,
-                 ttsmodel=Path('tts_models','en','jenny', 'jenny')):
+                 ttsmodel=None):
 
         # sampling rate in Hz
         self.samprate = samprate
 
         # tts model name
-        self.ttsmodel = str(ttsmodel)
+        self.ttsmodel = ttsmodel  # may be None; render() then falls back to a default for the active TTS backend
 
         # caption
         self.caption = caption
@@ -185,11 +185,25 @@ def render(self, downsamp=1):
 
         # produce mono audio of caption, if one is provided
         if str(self.caption or '').strip():
+            # backend chosen when tts_caption was imported: 'coqui-TTS' or 'pyttsx3'
+            ttsMode = get_ttsMode()
+
             # use a temporary directory to ensure caption file cleanup
             with tempfile.TemporaryDirectory() as cdir:
                 cpath = Path(cdir, 'caption.wav')
-                render_caption(self.caption, self.samprate,
-                               self.ttsmodel, cpath)
+                if ttsMode == 'coqui-TTS':
+                    # Coqui wants a '/'-separated model name on every platform
+                    if self.ttsmodel is None:
+                        self.ttsmodel = Path('tts_models','en','jenny', 'jenny')
+                    render_caption(self.caption, self.samprate,
+                                   Path(self.ttsmodel).as_posix(), cpath)
+                else:
+                    # pyttsx3 takes a dict of engine properties and needs
+                    # the output filename as a plain string
+                    if self.ttsmodel is None:
+                        self.ttsmodel = {}
+                    render_caption(self.caption, self.samprate,
+                                   self.ttsmodel, str(cpath))
                 rate_in, wavobj = wavfile.read(cpath)
                 wavobj = np.array(wavobj)
             # Set up the Stream objects for TTS

diff --git a/src/strauss/tts_caption.py b/src/strauss/tts_caption.py
@@ -3,53 +3,125 @@
 import numpy as np
 import strauss.utilities as utils
 import re
+
 try:
     from TTS.api import TTS
+    ttsMode = 'coqui-TTS'  # Coqui TTS is the preferred backend
 except (OSError, ModuleNotFoundError) as sderr:
-    def TTS(*args, **kwargs):
-        raise TTSIsNotSupported("strauss has not been installed with text-to-speech support. \n"
-              "This is not installed by default, due to some specific module requirements of the TTS module."
-              "Reinstalling strauss with 'pip install strauss[TTS]' will give you access to this function")
-
+    print('Coqui TTS not found. Trying to import pyttsx3...')
+    try:
+        import pyttsx3
+        ttsMode = 'pyttsx3'  # offline fallback backend
+        print('pyttsx3 has been successfully imported.')
+    except (OSError, ModuleNotFoundError):
+        ttsMode = 'None'  # the string 'None', not the None object
+        print('No supported text-to-speech packages have been found.')
+        def TTS(*args, **kwargs):  # stand-in: any attempt to use TTS raises
+            raise TTSIsNotSupported("strauss has not been installed with text-to-speech support. \n"
+                  "This is not installed by default, due to some specific module requirements of the TTS module.\n"
+                  "Reinstalling strauss with 'pip install strauss[TTS]' will give you access to this function\n"
+                  "If you run into issues with the TTS package, you can also install pyttsx3 with the command\n"
+                  "'pip install pyttsx3'.")
+
 class TTSIsNotSupported(Exception):
     pass
 
+def get_ttsMode():
+    return ttsMode  # 'coqui-TTS', 'pyttsx3' or 'None', as decided when this module was imported
+
+def getVoices(info=False):
+    '''Get available voices for text-to-speech.
+
+    Only pyttsx3 voices can be listed; when pyttsx3 is not the active
+    backend an empty list is returned.
+
+    Args:
+      info (:obj:`bool`): Print out each voice's properties when True,
+        by default False
+
+    Returns:
+      :obj:`list`: ``pyttsx3.voice.Voice`` objects (empty if unavailable)
+    '''
+    if ttsMode != 'pyttsx3':
+        return []
+    engine = pyttsx3.init()
+    voices = engine.getProperty('voices')
+    if info:
+        print('Text-to-speech voice options')
+        for ind, voice in enumerate(voices):
+            print('\nVoice index:', ind)
+            for key, value in vars(voice).items():
+                print('{}: {}'.format(key, value))
+    return voices
+
 def render_caption(caption, samprate, model, caption_path):
     '''The render_caption function generates an audio caption from text input
     and writes it as a wav file. If the sample rate of the model is not equal 
     to that passed from sonification.py, it resamples to the correct rate and
-    re-writes the file. Text from user input is converted with text-to-speech
-    software from Coqui-AI - https://pypi.org/project/TTS/ . You can view 
-    publicly available voice models with 'TTS.list_models()'
+    re-writes the file.
+
+    If Coqui-AI is installed, text from user input is converted with text-to-
+    speech software from Coqui-AI - https://pypi.org/project/TTS/ .
+    You can view publicly available voice models with 'TTS.list_models()'
+
+    If Coqui-AI is not installed but pyttsx3 (https://pypi.org/project/pyttsx3/)
+    is installed, text from user input is converted offline using pyttsx3.
+
+    Note:
+    ``ttsMode`` is ``coqui-TTS`` when Coqui-AI could be imported, otherwise
+    ``pyttsx3`` when that could be imported. If neither is available,
+    calling this function raises ``TTSIsNotSupported``.
 
     Args:
       caption (:obj:`str`): script to be spoken by the TTS voice
       samprate (:obj:`int`): samples per second
-      model (:obj:`str`): valid name of TTS voice from the underying TTS
-        module
-      model (:obj:`str`): valid name of TTS voice from the underying TTS
-        module
+      model (:obj:`str` for Coqui-AI; :obj:`dict` for pyttsx3): for Coqui-AI:
+        valid name of TTS voice from the underlying TTS module; for pyttsx3:
+        dictionary with keys of 'rate' (words per minute), 'volume' (float from
+        0 to 1), and/or 'voice' (voice id; 'voices' is accepted as an alias)
       caption_path (:obj:`str`): filepath for spoken caption output
     '''
 
-    # TODO: do this better with logging. We can filter TTS function output, e.g. alert to downloading models...
-    print('Rendering caption (this can take a while if the caption is long, or if the TTS model needs downloading)...')
+    if ttsMode == 'coqui-TTS':
+        # TODO: do this better with logging. We can filter TTS function output, e.g. alert to downloading models...
+        print('Rendering caption (this can take a while if the caption is long, or if the TTS model needs downloading)...')
+
+        # capture stdout from the talkative TTS module
+        with utils.Capturing() as output:
+            # Load in the tts model
+            tts = TTS(model, progress_bar=False, gpu=False)
+
+            # render to speech, and write as a wav file
+            tts.tts_to_file(text=caption, file_path=caption_path)
 
-    # capture stdout from the talkative TTS module
-    with utils.Capturing() as output:
-        # Load in the tts model
-        tts = TTS(model, progress_bar=False, gpu=False)
+    elif ttsMode == 'pyttsx3':
+        print('Rendering caption (this can take a while if the caption is long)...')
+
+        # capture stdout from the talkative TTS module
+        with utils.Capturing() as output:
+            # Setup voice model for pyttsx3
+            engine = pyttsx3.init() # initialize object
 
-        # render to speech, and write as a wav file (allow )
-        tts.tts_to_file(text=caption, file_path=caption_path)
+            # only set what was given; anything else keeps the engine default
+            for key in ['rate', 'volume', 'voice', 'voices']:
+                if key in model:
+                    # 'voice' is the settable pyttsx3 property name
+                    prop = 'voice' if key == 'voices' else key
+                    engine.setProperty(prop, model[key])
 
-
+            # render to speech, and write as a wav file
+            engine.save_to_file(text=caption, filename=str(caption_path))
+            engine.runAndWait()
+    else:
+        # no backend was importable: TTS is the stub that raises TTSIsNotSupported
+        TTS()
+
     # Read the file back in to check the sample rate
     rate_in, wavobj = wavfile.read(caption_path)
 
     #If it doesn't match the required rate, resample and re-write
     if rate_in != samprate:
         new_wavobj = utils.resample(rate_in, samprate, wavobj)
         wavfile.write(caption_path, samprate, new_wavobj)