From 396413c5497c69e99c6a2018476b9e758793ce33 Mon Sep 17 00:00:00 2001
From: Christopher Aedo <doc@aedo.net>
Date: Mon, 20 Nov 2023 12:17:31 -0800
Subject: [PATCH 1/2] first test with xtts

---
 epub2tts.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/epub2tts.py b/epub2tts.py
index 5dc6354..13ad484 100644
--- a/epub2tts.py
+++ b/epub2tts.py
@@ -44,7 +44,6 @@
     device = "cpu"
 print(f"Using device: {device}")
 
-model_name = "tts_models/en/vctk/vits"
 blacklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script']
 ffmetadatafile = "FFMETADATAFILE"
 
@@ -244,6 +243,12 @@ def combine_sentences(sentences, length=3500):
     yield combined
 
 def main():
+    if "--xtts" in sys.argv:
+        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+        index = sys.argv.index("--xtts")
+        speaker_wav = sys.argv[index + 1]
+    else:
+        model_name = "tts_models/en/vctk/vits"
     bookname = get_bookname() #detect .txt, .epub or https
     booktype = bookname.split('.')[-1]
     speaker_used = get_speaker()
@@ -318,7 +323,26 @@ def main():
                 for f in tempfiles:
                     os.remove(f)
             else:
-                tts.tts_to_file(text = chapters_to_read[i], speaker = speaker_used, file_path = outputwav)
+                if "--xtts" in sys.argv:
+
+                    tempfiles = []
+                    segmenter = pysbd.Segmenter(language="en", clean=True)
+                    sentences = segmenter.segment(chapters_to_read[i])
+                    sentence_groups = list(combine_sentences(sentences), 250)
+                    for x in range(len(sentence_groups)):
+                        tempwav = "temp" + str(x) + ".wav"
+                        print(sentence_groups[x])
+                        tts.tts_to_file(text=sentence_groups[x], speaker_wav = speaker_wav, file_path=tempwav, language="en")
+                        tempfiles.append(tempwav)
+                    tempwavfiles = [AudioSegment.from_mp3(f"{f}") for f in tempfiles]
+                    concatenated = sum(tempwavfiles)
+                    concatenated.export(outputwav, format="wav")
+#                    for f in tempfiles:
+#                        os.remove(f)
+
+                else:
+                    tts.tts_to_file(text = chapters_to_read[i], speaker = speaker_used, file_path = outputwav)
+                
 
         files.append(outputwav)
         position += len(chapters_to_read[i])

From 70effba8990ed063c6d04dc30dacb3d5a93194c5 Mon Sep 17 00:00:00 2001
From: Christopher Aedo <doc@aedo.net>
Date: Mon, 20 Nov 2023 20:50:07 -0800
Subject: [PATCH 2/2] Working great, needs cleanup though

---
 README.md   |  2 ++
 epub2tts.py | 23 ++++++++++++-----------
 setup.py    |  2 +-
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index ddb7c1b..630156c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Usage:
 
   URL:  `epub2tts --url https://www.example.com/page --name example-page`
 
+To use Coqui XTTS, add: `--xtts <sample.wav>`, where `<sample.wav>` is the voice sample passed to XTTS as the speaker (a GPU is absolutely required, and even then it's slow, but it sounds amazing!)
+
 To use OpenAI TTS, add: `--openai <your API key>` (Use speaker option to specify voice other than onyx: `--speaker shimmer`)
 
 To change speaker (ex p307 for a good male voice), add: `--speaker p307`
diff --git a/epub2tts.py b/epub2tts.py
index 13ad484..64a6d2d 100644
--- a/epub2tts.py
+++ b/epub2tts.py
@@ -57,6 +57,8 @@
 helpful for finding which chapter to start and end on if you want to
 skip TOC, bibliography, etc.
 
+To use Coqui XTTS, add: --xtts <sample.wav> (GPU absolutely required, and even then it's slow but sounds amazing!)
+To use OpenAI TTS, add: --openai <your API key> (Use speaker option to specify voice other than onyx: `--speaker shimmer`)
 To change speaker (ex p307 for a good male voice), add: --speaker p307
 To output in mp3 format instead of m4b, add: --mp3
 To skip reading any links, add: --skip-links
@@ -135,10 +137,11 @@ def get_speaker():
     if "--speaker" in sys.argv:
         index = sys.argv.index("--speaker")
         speaker_used = sys.argv[index + 1]    
-    else:
-        if "--openai" in sys.argv:
+    elif "--openai" in sys.argv:
             speaker_used = "onyx"
-        else:
+    elif "--xtts" in sys.argv:
+            speaker_used = "xtts"
+    else:
             speaker_used = "p335"
     print(f"Speaker: {speaker_used}")
     return(speaker_used)
@@ -162,9 +165,8 @@ def get_chapters_epub(book, bookname):
     for i in range(len(chapters)):
         #strip some characters that might have caused TTS to choke
         text = chap2text(chapters[i])
-        #this still misses a lot of special characters...
-        #text = text.translate({ord(c): None for c in '[]*“”"\''})
-        allowed_chars = string.ascii_letters + string.digits + '-,.!? '
+        text = text.replace("—", ", ")
+        allowed_chars = string.ascii_letters + string.digits + "-,.!? '"
         text = ''.join(c for c in text if c in allowed_chars)
         if len(text) < 150:
             #too short to bother with
@@ -324,21 +326,20 @@ def main():
                     os.remove(f)
             else:
                 if "--xtts" in sys.argv:
-
+                    # TODO: this block largely duplicates code elsewhere in this function; factor it into a shared helper.
                     tempfiles = []
                     segmenter = pysbd.Segmenter(language="en", clean=True)
                     sentences = segmenter.segment(chapters_to_read[i])
-                    sentence_groups = list(combine_sentences(sentences), 250)
+                    sentence_groups = list(combine_sentences(sentences, 1000))
                     for x in range(len(sentence_groups)):
                         tempwav = "temp" + str(x) + ".wav"
-                        print(sentence_groups[x])
                         tts.tts_to_file(text=sentence_groups[x], speaker_wav = speaker_wav, file_path=tempwav, language="en")
                         tempfiles.append(tempwav)
                     tempwavfiles = [AudioSegment.from_mp3(f"{f}") for f in tempfiles]
                     concatenated = sum(tempwavfiles)
                     concatenated.export(outputwav, format="wav")
-#                    for f in tempfiles:
-#                        os.remove(f)
+                    for f in tempfiles:
+                        os.remove(f)
 
                 else:
                     tts.tts_to_file(text = chapters_to_read[i], speaker = speaker_used, file_path = outputwav)
diff --git a/setup.py b/setup.py
index 65a7405..2d6c0d0 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
     author_email='doc@aedo.net',
     url='https://github.com/aedocw/epub2tts',
     license='Apache License, Version 2.0',
-    version='1.4.0',
+    version='1.5.0',
     packages=find_packages(),
     install_requires=requirements,
     py_modules=['epub2tts'],