address #28

lucidrains · Aug 27, 2024 · b39fdbe
1 parent e0326d0
commit b39fdbe
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 5 deletions.
diff --git a/e2_tts_pytorch/e2_tts.py b/e2_tts_pytorch/e2_tts.py
@@ -248,22 +248,24 @@ def __init__(
         cond_audio_to_text = True
     ):
         super().__init__()
-        self.text_to_audio = nn.Linear(dim_text, dim, bias = False)
+        self.text_to_audio = nn.Linear(dim_text + dim, dim, bias = False)
         nn.init.zeros_(self.text_to_audio.weight)
 
         self.cond_audio_to_text = cond_audio_to_text
 
         if cond_audio_to_text:
-            self.audio_to_text = nn.Linear(dim, dim_text, bias = False)
+            self.audio_to_text = nn.Linear(dim + dim_text, dim_text, bias = False)
             nn.init.zeros_(self.audio_to_text.weight)
 
     def forward(
         self,
         audio: Float['b n d'],
         text: Float['b n dt']
     ):
-        text_cond = self.text_to_audio(text)
-        audio_cond = self.audio_to_text(audio) if self.cond_audio_to_text else 0.
+        audio_text, _ = pack((audio, text), 'b n *')
+
+        text_cond = self.text_to_audio(audio_text)
+        audio_cond = self.audio_to_text(audio_text) if self.cond_audio_to_text else 0.
 
         return audio + text_cond, text + audio_cond
 
@@ -742,6 +744,10 @@ def transformer_with_pred_head(
         x = self.proj_in(x)
         cond = self.cond_proj_in(cond)
 
+        # add the condition to the projected input, following a voicebox-like scheme
+
+        x = x + cond
+
         # whether to use a text embedding
 
         text_embed = None

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "e2-tts-pytorch"
-version = "0.6.2"
+version = "0.6.3"
 description = "E2-TTS in Pytorch"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }