Add support for YouTube videos

dhruvbaldawa · Dec 15, 2024 · 3261755 · 3261755
1 parent 15c9325
commit 3261755
Show file tree

Hide file tree

Showing 8 changed files with 104 additions and 21 deletions.
diff --git a/gyandex/cli/podgen.py b/gyandex/cli/podgen.py
@@ -79,7 +79,7 @@ def main():
             audio_file_path=podcast_path,
             metadata=PodcastMetadata(
                 title=script.title,
-                description=script.description,
+                description=script.description + f"\n\nSource: {config.content.source}",
             ),
         )
     console.print(f"Feed published at {urls['feed_url']}")

diff --git a/gyandex/loaders/factory.py b/gyandex/loaders/factory.py
@@ -1,22 +1,16 @@
-from typing import Any, Dict, Optional
-
 import requests
-from pydantic import BaseModel
 
 from ..podgen.config.schema import ContentConfig, ContentFormat  # @TODO: Pull this out of podgen
-
-
-# @TODO: pull this out of this file
-class Document(BaseModel):
-    title: Optional[str] = None
-    metadata: Optional[Dict[str, Any]] = None
-    content: str
+from .types import Document
+from .youtube import fetch_youtube
 
 
 def load_content(content_config: ContentConfig) -> Document:
-    if content_config.format != ContentFormat.HTML:
-        raise NotImplementedError(f"Unsupported content format: {content_config.format}")
-    return fetch_url(content_config.source)
+    loaders = {ContentFormat.HTML: fetch_url, ContentFormat.YOUTUBE: fetch_youtube}  # format -> loader
+    loader = loaders.get(content_config.format)
+    if loader is None:  # any format without a registered loader is rejected
+        raise NotImplementedError(f"Unsupported content format: {content_config.format}")
+    return loader(content_config.source)
 
 
 def fetch_url(url) -> Document:

diff --git a/gyandex/loaders/types.py b/gyandex/loaders/types.py
@@ -0,0 +1,9 @@
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel
+
+
+class Document(BaseModel):  # loader output: the text of one source plus optional descriptive fields
+    title: Optional[str] = None  # optional title of the source
+    metadata: Optional[Dict[str, Any]] = None  # optional free-form details about the source
+    content: str  # the loaded text itself; the only field without a default
diff --git a/gyandex/loaders/youtube.py b/gyandex/loaders/youtube.py
@@ -0,0 +1,42 @@
+import xml.etree.ElementTree as ET
+from urllib.parse import unquote_plus
+
+import requests
+import yt_dlp
+
+from .types import Document
+
+
+def fetch_youtube(url: str) -> Document:
+    """
+    Fetch the English transcript of a YouTube video and wrap it in a Document
+
+    :param url: URL of the video; handed to yt-dlp, which only extracts metadata (no download)
+    :return: Document with the video title, the space-joined transcript as content, and url/description metadata
+    :raises ValueError: if no video information or no English subtitle track is found
+    """
+    from html import unescape  # local import: caption text is HTML-escaped, not URL-encoded
+
+    ydl_opts = {
+        "writesubtitles": True,
+        "writeautomaticsub": True,
+        "subtitleslangs": ["en"],
+        "subtitlesformat": "srv1",
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+    if not info:
+        raise ValueError("Could not extract video information")
+    # requested_subtitles is None (or lacks "en") when the video has no English captions
+    subtitle = (info.get("requested_subtitles") or {}).get("en")
+    if not subtitle or not subtitle.get("url"):
+        raise ValueError(f"No English subtitles available for {url}")
+    r = requests.get(subtitle["url"], timeout=30)  # bounded wait instead of hanging forever
+    r.raise_for_status()  # do not try to parse an HTTP error page as a transcript
+    root = ET.fromstring(r.content)  # bytes, so the XML declaration's encoding is honoured
+    texts = [unescape(node.text or "") for node in root.findall("text")]
+    return Document(
+        title=info.get("title", ""),
+        content=" ".join(texts),
+        metadata={"url": url, "description": info.get("description", "")},
+    )
diff --git a/gyandex/podgen/config/schema.py b/gyandex/podgen/config/schema.py
@@ -7,7 +7,7 @@
 # @TODO: Redo this, the content format can be better structured
 class ContentFormat(Enum):
     HTML = "html"
-    MARKDOWN = "markdown"
+    YOUTUBE = "youtube"
     PDF = "pdf"
     TEXT = "text"
 

diff --git a/gyandex/podgen/workflows/alexandria.py b/gyandex/podgen/workflows/alexandria.py
@@ -1,13 +1,15 @@
 import asyncio
+from json import JSONDecodeError
 from textwrap import dedent
 from typing import List
 
 from langchain.output_parsers import PydanticOutputParser
 from langchain.prompts import PromptTemplate
+from langchain_core.exceptions import OutputParserException
 from rich import print as rprint
 
 from ...llms.factory import get_model
-from ...loaders.factory import Document
+from ...loaders.types import Document
 from ..config.schema import LLMConfig, Participant, PodcastConfig
 from .types import OutlineSegment, PodcastEpisode, PodcastOutline, ScriptSegment
 
@@ -47,7 +49,14 @@ def __init__(self, config: LLMConfig):
 
     def generate_outline(self, document: Document) -> PodcastOutline:
         """Generate structured podcast outline from content summary"""
-        chain = self.outline_prompt | self.model | self.parser
+        # Retry the model call together with the parsing step: retrying the parser
+        # alone would only re-parse the same malformed output and fail again.
+        # stop_after_attempt=2 means at most one extra model call per invocation.
+        model_and_parser = (self.model | self.parser).with_retry(
+            stop_after_attempt=2,
+            retry_if_exception_type=(JSONDecodeError, OutputParserException),
+        )
+        chain = self.outline_prompt | model_and_parser
         response = chain.invoke({"content": document.content, "title": document.title})
         return response
 
@@ -112,7 +121,14 @@ def __init__(self, config: LLMConfig, participants: List[Participant]):
             """),
         )
 
-        self.chain = self.segment_prompt | self.model | self.parser
+        self.chain = (
+            self.segment_prompt
+            | self.model
+            | self.parser.with_retry(
+                stop_after_attempt=2,
+                retry_if_exception_type=(JSONDecodeError, OutputParserException),
+            )
+        )
 
     def create_host_profile(self, participant: Participant):
         return f"HOST ({participant.name})[{participant.gender}]: {participant.personality}"

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ readme = "README.md"
 podgen = "gyandex.cli.genpod:main"
 
 [tool.poetry.dependencies]
-python = "^3.11"
+python = "^3.11,<3.14"
 pandas = "^2.2.3"
 notebook = "^7.2.2"
 torch = "^2.5.0"
@@ -26,6 +26,7 @@ mutagen = "^1.47.0"
 rich = {extras = ["jupyter"], version = "^13.9.3"}
 python-slugify = "^8.0.4"
 langchain-openai = "^0.2.12"
+yt-dlp = "^2024.12.13"
 
 [tool.poetry.group.dev.dependencies]
 nbstripout = "^0.7.1"