Skip to content

Commit

Permalink
Add support for Youtube videos
Browse files Browse the repository at this point in the history
  • Loading branch information
dhruvbaldawa committed Dec 15, 2024
1 parent 15c9325 commit 3261755
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 21 deletions.
2 changes: 1 addition & 1 deletion gyandex/cli/podgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def main():
audio_file_path=podcast_path,
metadata=PodcastMetadata(
title=script.title,
description=script.description,
description=script.description + f"\n\nSource: {config.content.source}",
),
)
console.print(f"Feed published at {urls['feed_url']}")
Expand Down
20 changes: 7 additions & 13 deletions gyandex/loaders/factory.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
from typing import Any, Dict, Optional

import requests
from pydantic import BaseModel

from ..podgen.config.schema import ContentConfig, ContentFormat # @TODO: Pull this out of podgen


# @TODO: pull this out of this file
class Document(BaseModel):
title: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
content: str
from .types import Document
from .youtube import fetch_youtube


def load_content(content_config: ContentConfig) -> Document:
if content_config.format != ContentFormat.HTML:
raise NotImplementedError(f"Unsupported content format: {content_config.format}")
return fetch_url(content_config.source)
if content_config.format == ContentFormat.HTML:
return fetch_url(content_config.source)
elif content_config.format == ContentFormat.YOUTUBE:
return fetch_youtube(content_config.source)
raise NotImplementedError(f"Unsupported content format: {content_config.format}")


def fetch_url(url) -> Document:
Expand Down
9 changes: 9 additions & 0 deletions gyandex/loaders/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import Any, Dict, Optional

from pydantic import BaseModel


class Document(BaseModel):
    """Content produced by a loader, plus optional descriptive fields.

    Only ``content`` is required; ``title`` and ``metadata`` default to
    ``None`` when a loader does not supply them.
    """

    # Human-readable title of the source, when one is available.
    title: str | None = None
    # Free-form extra information about the source, keyed by string.
    metadata: dict[str, Any] | None = None
    # The text of the document itself (required).
    content: str
42 changes: 42 additions & 0 deletions gyandex/loaders/youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import xml.etree.ElementTree as ET
from urllib.parse import unquote_plus

import requests
import yt_dlp

from .types import Document


def fetch_youtube(url: str) -> Document:
    """
    Fetch the English transcript of a YouTube video as a Document.

    Video information is resolved through yt-dlp without downloading the
    media; the English subtitle track (manual or automatic) is then fetched
    in ``srv1`` XML format and flattened to plain text.

    :param url: URL of the YouTube video.
    :return: Document whose ``title`` is the video title, whose ``content``
        is every caption text joined by single spaces, and whose
        ``metadata`` holds the source ``url`` and the video ``description``.
    :raises ValueError: if no video information could be extracted, or the
        video exposes no English subtitle track.
    :raises requests.HTTPError: if the subtitle download returns an error
        status.
    """
    # Local import so this block stays self-contained.
    import html

    ydl_opts = {
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": ["en"],
        "subtitlesformat": "srv1",
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
    if not info:
        raise ValueError("Could not extract video information")

    # "requested_subtitles" may be missing or None when the video has no
    # captions; fail with a clear message instead of a KeyError/TypeError.
    subtitle_track = (info.get("requested_subtitles") or {}).get("en")
    if not subtitle_track or not subtitle_track.get("url"):
        raise ValueError("No English subtitles available for this video")

    # Bound the request so a stalled connection cannot hang the caller, and
    # surface HTTP errors rather than trying to parse an error page as XML.
    r = requests.get(subtitle_track["url"], timeout=30)
    r.raise_for_status()

    # Parse the raw bytes so the XML parser applies the document's own
    # encoding rather than the charset guessed for ``r.text``.
    root = ET.fromstring(r.content)

    # Caption text carries HTML entities (e.g. "&#39;") once the XML layer
    # is parsed. It is not URL-encoded, so unescape entities instead of
    # URL-decoding, which would turn "+" into a space and mangle "%xx".
    texts = [html.unescape(node.text or "") for node in root.findall("text")]

    # Join all texts with space
    return Document(
        title=info.get("title", ""),
        content=" ".join(texts),
        metadata={
            "url": url,
            "description": info.get("description", ""),
        },
    )
2 changes: 1 addition & 1 deletion gyandex/podgen/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# @TODO: Redo this, the content format can be better structured
class ContentFormat(Enum):
HTML = "html"
MARKDOWN = "markdown"
YOUTUBE = "youtube"
PDF = "pdf"
TEXT = "text"

Expand Down
22 changes: 19 additions & 3 deletions gyandex/podgen/workflows/alexandria.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import asyncio
from json import JSONDecodeError
from textwrap import dedent
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.exceptions import OutputParserException
from rich import print as rprint

from ...llms.factory import get_model
from ...loaders.factory import Document
from ...loaders.types import Document
from ..config.schema import LLMConfig, Participant, PodcastConfig
from .types import OutlineSegment, PodcastEpisode, PodcastOutline, ScriptSegment

Expand Down Expand Up @@ -47,7 +49,14 @@ def __init__(self, config: LLMConfig):

def generate_outline(self, document: Document) -> PodcastOutline:
"""Generate structured podcast outline from content summary"""
chain = self.outline_prompt | self.model | self.parser
chain = (
self.outline_prompt
| self.model
| self.parser.with_retry(
stop_after_attempt=2,
retry_if_exception_type=(JSONDecodeError, OutputParserException),
)
)
response = chain.invoke({"content": document.content, "title": document.title})
return response

Expand Down Expand Up @@ -112,7 +121,14 @@ def __init__(self, config: LLMConfig, participants: List[Participant]):
"""),
)

self.chain = self.segment_prompt | self.model | self.parser
self.chain = (
self.segment_prompt
| self.model
| self.parser.with_retry(
stop_after_attempt=2,
retry_if_exception_type=(JSONDecodeError, OutputParserException),
)
)

def create_host_profile(self, participant: Participant):
return f"HOST ({participant.name})[{participant.gender}]: {participant.personality}"
Expand Down
25 changes: 23 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ readme = "README.md"
podgen = "gyandex.cli.genpod:main"

[tool.poetry.dependencies]
python = "^3.11"
python = "^3.11,<3.14"
pandas = "^2.2.3"
notebook = "^7.2.2"
torch = "^2.5.0"
Expand All @@ -26,6 +26,7 @@ mutagen = "^1.47.0"
rich = {extras = ["jupyter"], version = "^13.9.3"}
python-slugify = "^8.0.4"
langchain-openai = "^0.2.12"
yt-dlp = "^2024.12.13"

[tool.poetry.group.dev.dependencies]
nbstripout = "^0.7.1"
Expand Down

0 comments on commit 3261755

Please sign in to comment.