From a0bebe713b12f5cd9bde03a25951eee5d02086ed Mon Sep 17 00:00:00 2001 From: Raphael Christi Date: Wed, 15 Jan 2025 14:39:14 -0300 Subject: [PATCH 01/16] Adiciona bundle do YouTube --- .../langflow/components/youtube/__init__.py | 17 ++ .../langflow/components/youtube/channel.py | 227 ++++++++++++++ .../langflow/components/youtube/comments.py | 231 +++++++++++++++ .../langflow/components/youtube/playlist.py | 31 ++ .../langflow/components/youtube/search.py | 118 ++++++++ .../langflow/components/youtube/trending.py | 277 ++++++++++++++++++ .../components/youtube/video_details.py | 252 ++++++++++++++++ .../components/youtube/youtube_transcripts.py | 243 +++++++++++++++ 8 files changed, 1396 insertions(+) create mode 100644 src/backend/base/langflow/components/youtube/__init__.py create mode 100644 src/backend/base/langflow/components/youtube/channel.py create mode 100644 src/backend/base/langflow/components/youtube/comments.py create mode 100644 src/backend/base/langflow/components/youtube/playlist.py create mode 100644 src/backend/base/langflow/components/youtube/search.py create mode 100644 src/backend/base/langflow/components/youtube/trending.py create mode 100644 src/backend/base/langflow/components/youtube/video_details.py create mode 100644 src/backend/base/langflow/components/youtube/youtube_transcripts.py diff --git a/src/backend/base/langflow/components/youtube/__init__.py b/src/backend/base/langflow/components/youtube/__init__.py new file mode 100644 index 000000000000..d53440b04597 --- /dev/null +++ b/src/backend/base/langflow/components/youtube/__init__.py @@ -0,0 +1,17 @@ +from .channel import YouTubeChannelComponent +from .comments import YouTubeCommentsComponent +from .playlist import YouTubePlaylistComponent +from .search import YouTubeSearchComponent +from .youtube_transcripts import YouTubeTranscriptsComponent +from .trending import YouTubeTrendingComponent +from .video_details import YouTubeVideoDetailsComponent + +__all__ = [ + 
"YouTubeChannelComponent", + "YouTubeCommentsComponent", + "YouTubePlaylistComponent", + "YouTubeSearchComponent", + "YouTubeTranscriptsComponent", + "YouTubeTrendingComponent", + "YouTubeVideoDetailsComponent", +] \ No newline at end of file diff --git a/src/backend/base/langflow/components/youtube/channel.py b/src/backend/base/langflow/components/youtube/channel.py new file mode 100644 index 000000000000..1b7ff9311c42 --- /dev/null +++ b/src/backend/base/langflow/components/youtube/channel.py @@ -0,0 +1,227 @@ +from typing import Any +from urllib.error import HTTPError + +import pandas as pd +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError + +from langflow.custom import Component +from langflow.inputs import BoolInput, MessageTextInput, SecretStrInput +from langflow.schema import DataFrame +from langflow.template import Output + + +class YouTubeError(Exception): + """Base exception class for YouTube-related errors.""" + + +class YouTubeAPIError(YouTubeError): + """Exception raised for YouTube API-related errors.""" + + +class YouTubeChannelComponent(Component): + """A component that retrieves detailed information about YouTube channels.""" + + display_name: str = "YouTube Channel" + description: str = "Retrieves detailed information and statistics about YouTube channels as a DataFrame." 
+ icon: str = "YouTube" + name = "YouTubeChannel" + + # Constants + CHANNEL_ID_LENGTH = 24 + QUOTA_EXCEEDED_STATUS = 403 + NOT_FOUND_STATUS = 404 + MAX_PLAYLIST_RESULTS = 10 + + inputs = [ + MessageTextInput( + name="channel_url", + display_name="Channel URL or ID", + info="The URL or ID of the YouTube channel.", + tool_mode=True, + ), + SecretStrInput( + name="api_key", + display_name="YouTube API Key", + info="Your YouTube Data API key.", + required=True, + ), + BoolInput( + name="include_statistics", + display_name="Include Statistics", + value=True, + info="Include channel statistics (views, subscribers, videos).", + ), + BoolInput( + name="include_branding", + display_name="Include Branding", + value=True, + info="Include channel branding settings (banner, thumbnails).", + advanced=True, + ), + BoolInput( + name="include_playlists", + display_name="Include Playlists", + value=False, + info="Include channel's public playlists.", + advanced=True, + ), + ] + + outputs = [ + Output(name="channel_df", display_name="Channel Info", method="get_channel_info"), + ] + + def _extract_channel_id(self, channel_url: str) -> str: + """Extracts the channel ID from various YouTube channel URL formats.""" + import re + + if channel_url.startswith("UC") and len(channel_url) == self.CHANNEL_ID_LENGTH: + return channel_url + + patterns = { + "custom_url": r"youtube\.com\/c\/([^\/\n?]+)", + "channel_id": r"youtube\.com\/channel\/([^\/\n?]+)", + "user": r"youtube\.com\/user\/([^\/\n?]+)", + "handle": r"youtube\.com\/@([^\/\n?]+)", + } + + for pattern_type, pattern in patterns.items(): + match = re.search(pattern, channel_url) + if match: + if pattern_type == "channel_id": + return match.group(1) + return self._get_channel_id_by_name(match.group(1), pattern_type) + + return channel_url + + def _get_channel_id_by_name(self, channel_name: str, identifier_type: str) -> str: + """Gets the channel ID using the channel name or custom URL.""" + try: + youtube = build("youtube", "v3", 
developerKey=self.api_key) + + if identifier_type == "handle": + channel_name = channel_name.lstrip("@") + + request = youtube.search().list(part="id", q=channel_name, type="channel", maxResults=1) + response = request.execute() + + if response["items"]: + return response["items"][0]["id"]["channelId"] + + error_msg = f"Could not find channel ID for: {channel_name}" + raise YouTubeError(error_msg) + + except (HttpError, HTTPError) as e: + error_msg = f"YouTube API error while getting channel ID: {e!s}" + raise YouTubeAPIError(error_msg) from e + except Exception as e: + error_msg = f"Unexpected error while getting channel ID: {e!s}" + raise YouTubeError(error_msg) from e + + def _get_channel_playlists(self, youtube: Any, channel_id: str) -> list[dict[str, Any]]: + """Gets the public playlists for a channel.""" + try: + playlists_request = youtube.playlists().list( + part="snippet,contentDetails", + channelId=channel_id, + maxResults=self.MAX_PLAYLIST_RESULTS, + ) + playlists_response = playlists_request.execute() + playlists = [] + + for item in playlists_response.get("items", []): + playlist_data = { + "playlist_title": item["snippet"]["title"], + "playlist_description": item["snippet"]["description"], + "playlist_id": item["id"], + "playlist_video_count": item["contentDetails"]["itemCount"], + "playlist_published_at": item["snippet"]["publishedAt"], + "playlist_thumbnail_url": item["snippet"]["thumbnails"]["default"]["url"], + } + playlists.append(playlist_data) + + return playlists + except (HttpError, HTTPError) as e: + return [{"error": str(e)}] + else: + return playlists + + def get_channel_info(self) -> DataFrame: + """Retrieves channel information and returns it as a DataFrame.""" + try: + # Get channel ID and initialize YouTube API client + channel_id = self._extract_channel_id(self.channel_url) + youtube = build("youtube", "v3", developerKey=self.api_key) + + # Prepare parts for the API request + parts = ["snippet", "contentDetails"] + if 
self.include_statistics: + parts.append("statistics") + if self.include_branding: + parts.append("brandingSettings") + + # Get channel information + channel_response = youtube.channels().list(part=",".join(parts), id=channel_id).execute() + + if not channel_response["items"]: + return DataFrame(pd.DataFrame({"error": ["Channel not found"]})) + + channel_info = channel_response["items"][0] + + # Build basic channel data + channel_data = { + "title": [channel_info["snippet"]["title"]], + "description": [channel_info["snippet"]["description"]], + "custom_url": [channel_info["snippet"].get("customUrl", "")], + "published_at": [channel_info["snippet"]["publishedAt"]], + "country": [channel_info["snippet"].get("country", "Not specified")], + "channel_id": [channel_id], + } + + # Add thumbnails + for size, thumb in channel_info["snippet"]["thumbnails"].items(): + channel_data[f"thumbnail_{size}"] = [thumb["url"]] + + # Add statistics if requested + if self.include_statistics: + stats = channel_info["statistics"] + channel_data.update( + { + "view_count": [int(stats.get("viewCount", 0))], + "subscriber_count": [int(stats.get("subscriberCount", 0))], + "hidden_subscriber_count": [stats.get("hiddenSubscriberCount", False)], + "video_count": [int(stats.get("videoCount", 0))], + } + ) + + # Add branding if requested + if self.include_branding: + branding = channel_info.get("brandingSettings", {}) + channel_data.update( + { + "brand_title": [branding.get("channel", {}).get("title", "")], + "brand_description": [branding.get("channel", {}).get("description", "")], + "brand_keywords": [branding.get("channel", {}).get("keywords", "")], + "brand_banner_url": [branding.get("image", {}).get("bannerExternalUrl", "")], + } + ) + + # Create the initial DataFrame + channel_df = pd.DataFrame(channel_data) + + # Add playlists if requested + if self.include_playlists: + playlists = self._get_channel_playlists(youtube, channel_id) + if playlists and "error" not in playlists[0]: + # Create a 
DataFrame for playlists + playlists_df = pd.DataFrame(playlists) + # Join with main DataFrame + channel_df = pd.concat([channel_df] * len(playlists_df), ignore_index=True) + for column in playlists_df.columns: + channel_df[column] = playlists_df[column].to_numpy() + + return DataFrame(channel_df) + + except (HttpError, HTTPError, Exception) as e: + return DataFrame(pd.DataFrame({"error": [str(e)]})) \ No newline at end of file diff --git a/src/backend/base/langflow/components/youtube/comments.py b/src/backend/base/langflow/components/youtube/comments.py new file mode 100644 index 000000000000..0aa92f0d89eb --- /dev/null +++ b/src/backend/base/langflow/components/youtube/comments.py @@ -0,0 +1,231 @@ +import pandas as pd +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError + +from langflow.custom import Component +from langflow.inputs import BoolInput, DropdownInput, IntInput, MessageTextInput, SecretStrInput +from langflow.schema import DataFrame +from langflow.template import Output + + +class YouTubeError(Exception): + """Base exception class for YouTube-related errors.""" + + +class YouTubeAPIError(YouTubeError): + """Exception raised for YouTube API-related errors.""" + + +class YouTubeCommentsComponent(Component): + """A component that retrieves comments from YouTube videos.""" + + display_name: str = "YouTube Comments" + description: str = "Retrieves and analyzes comments from YouTube videos." 
+ icon: str = "YouTube" + name = "YouTubeComments" + + # Constants + COMMENTS_DISABLED_STATUS = 403 + NOT_FOUND_STATUS = 404 + API_MAX_RESULTS = 100 + + inputs = [ + MessageTextInput( + name="video_url", + display_name="Video URL", + info="The URL of the YouTube video to get comments from.", + tool_mode=True, + ), + SecretStrInput( + name="api_key", + display_name="YouTube API Key", + info="Your YouTube Data API key.", + required=True, + ), + IntInput( + name="max_results", + display_name="Max Results", + value=20, + info="The maximum number of comments to return.", + ), + DropdownInput( + name="sort_by", + display_name="Sort By", + options=["time", "relevance"], + value="relevance", + info="Sort comments by time or relevance.", + ), + BoolInput( + name="include_replies", + display_name="Include Replies", + value=False, + info="Whether to include replies to comments.", + advanced=True, + ), + BoolInput( + name="include_metrics", + display_name="Include Metrics", + value=True, + info="Include metrics like like count and reply count.", + advanced=True, + ), + ] + + outputs = [ + Output(name="comments", display_name="Comments", method="get_video_comments"), + ] + + def _extract_video_id(self, video_url: str) -> str: + """Extracts the video ID from a YouTube URL.""" + import re + + patterns = [ + r"(?:youtube\.com\/watch\?v=|youtu.be\/|youtube.com\/embed\/)([^&\n?#]+)", + r"youtube.com\/shorts\/([^&\n?#]+)", + ] + + for pattern in patterns: + match = re.search(pattern, video_url) + if match: + return match.group(1) + + return video_url.strip() + + def _process_reply(self, reply: dict, parent_id: str, *, include_metrics: bool = True) -> dict: + """Process a single reply comment.""" + reply_snippet = reply["snippet"] + reply_data = { + "comment_id": reply["id"], + "parent_comment_id": parent_id, + "author": reply_snippet["authorDisplayName"], + "text": reply_snippet["textDisplay"], + "published_at": reply_snippet["publishedAt"], + "is_reply": True, + } + if 
include_metrics: + reply_data["like_count"] = reply_snippet["likeCount"] + reply_data["reply_count"] = 0 # Replies can't have replies + + return reply_data + + def _process_comment( + self, item: dict, *, include_metrics: bool = True, include_replies: bool = False + ) -> list[dict]: + """Process a single comment thread.""" + comment = item["snippet"]["topLevelComment"]["snippet"] + comment_id = item["snippet"]["topLevelComment"]["id"] + + # Basic comment data + processed_comments = [ + { + "comment_id": comment_id, + "parent_comment_id": "", # Empty for top-level comments + "author": comment["authorDisplayName"], + "author_channel_url": comment.get("authorChannelUrl", ""), + "text": comment["textDisplay"], + "published_at": comment["publishedAt"], + "updated_at": comment["updatedAt"], + "is_reply": False, + } + ] + + # Add metrics if requested + if include_metrics: + processed_comments[0].update( + { + "like_count": comment["likeCount"], + "reply_count": item["snippet"]["totalReplyCount"], + } + ) + + # Add replies if requested + if include_replies and item["snippet"]["totalReplyCount"] > 0 and "replies" in item: + for reply in item["replies"]["comments"]: + reply_data = self._process_reply(reply, parent_id=comment_id, include_metrics=include_metrics) + processed_comments.append(reply_data) + + return processed_comments + + def get_video_comments(self) -> DataFrame: + """Retrieves comments from a YouTube video and returns as DataFrame.""" + try: + # Extract video ID from URL + video_id = self._extract_video_id(self.video_url) + + # Initialize YouTube API client + youtube = build("youtube", "v3", developerKey=self.api_key) + + # Prepare the initial request + request = youtube.commentThreads().list( + part="snippet,replies", + videoId=video_id, + maxResults=min(self.API_MAX_RESULTS, self.max_results), + order=self.sort_by, + textFormat="plainText", + ) + + comments_data = [] + results_count = 0 + + while request and results_count < self.max_results: + response = 
request.execute() + + for item in response.get("items", []): + if results_count >= self.max_results: + break + + comments = self._process_comment( + item, include_metrics=self.include_metrics, include_replies=self.include_replies + ) + comments_data.extend(comments) + results_count += 1 + + # Get the next page if available and needed + if "nextPageToken" in response and results_count < self.max_results: + request = youtube.commentThreads().list( + part="snippet,replies", + videoId=video_id, + maxResults=min(self.API_MAX_RESULTS, self.max_results - results_count), + order=self.sort_by, + textFormat="plainText", + pageToken=response["nextPageToken"], + ) + else: + request = None + + # Convert to DataFrame + comments_df = pd.DataFrame(comments_data) + + # Add video metadata + comments_df["video_id"] = video_id + comments_df["video_url"] = self.video_url + + # Sort columns for better organization + column_order = [ + "video_id", + "video_url", + "comment_id", + "parent_comment_id", + "is_reply", + "author", + "author_channel_url", + "text", + "published_at", + "updated_at", + ] + + if self.include_metrics: + column_order.extend(["like_count", "reply_count"]) + + comments_df = comments_df[column_order] + + return DataFrame(comments_df) + + except HttpError as e: + error_message = f"YouTube API error: {e!s}" + if e.resp.status == self.COMMENTS_DISABLED_STATUS: + error_message = "Comments are disabled for this video or API quota exceeded." + elif e.resp.status == self.NOT_FOUND_STATUS: + error_message = "Video not found." 
class YouTubePlaylistComponent(Component):
    """Component that lists the watch URL of every video in a YouTube playlist."""

    display_name = "Youtube Playlist"
    description = "Extracts all video URLs from a YouTube playlist."
    icon = "YouTube"

    inputs = [
        MessageTextInput(
            name="playlist_url",
            display_name="Playlist URL",
            info="URL of the YouTube playlist.",
        ),
    ]

    outputs = [
        Output(display_name="Video URLs", name="video_urls", method="extract_video_urls"),
    ]

    def extract_video_urls(self) -> DataFrame:
        """Return one row per playlist video, each holding its ``video_url``.

        The playlist is read from ``self.playlist_url`` through
        ``pytube.Playlist``; each video's ``watch_url`` becomes a ``Data``
        record in the resulting ``DataFrame``.
        """
        watch_urls = []
        for video in Playlist(self.playlist_url).videos:
            watch_urls.append(video.watch_url)

        rows = []
        for watch_url in watch_urls:
            rows.append(Data(data={"video_url": watch_url}))
        return DataFrame(rows)
def search_youtube(self) -> DataFrame:
    """Search YouTube for videos matching ``self.query``.

    Returns:
        A DataFrame with one row per video. Columns are the base fields
        (``video_id``, ``title``, ``url``, ``channel_id``, ``channel_title``,
        ``published_at``, ``search_query``, ``description``) followed by the
        ``thumbnail_*`` columns in sorted order. When nothing is found, or an
        ``HttpError``/``KeyError``/``EmptyDataError`` occurs, a single-row
        DataFrame with an ``error`` column is returned instead.
    """
    # search.list only accepts maxResults between 0 and 50; clamp integer
    # input so an out-of-range value does not become an API error.
    api_max_results = 50
    max_results = self.max_results
    if isinstance(max_results, int):
        max_results = max(0, min(max_results, api_max_results))

    try:
        # Initialize YouTube API client
        youtube = build("youtube", "v3", developerKey=self.api_key)

        search_response = (
            youtube.search()
            .list(q=self.query, type="video", part="id,snippet", maxResults=max_results)
            .execute()
        )

        # Flatten each search hit into one row
        video_data_list = []
        for search_result in search_response.get("items", []):
            video_id = search_result["id"]["videoId"]
            snippet = search_result["snippet"]

            video_data = {
                "video_id": video_id,
                "url": f"https://www.youtube.com/watch?v={video_id}",
                "title": snippet["title"],
                "description": snippet["description"],
                "channel_id": snippet["channelId"],
                "channel_title": snippet["channelTitle"],
                "published_at": snippet["publishedAt"],
                "search_query": self.query,
            }

            # One url/width/height triple per thumbnail size
            for size, thumb in snippet["thumbnails"].items():
                video_data[f"thumbnail_{size}_url"] = thumb["url"]
                video_data[f"thumbnail_{size}_width"] = thumb.get("width", 0)
                video_data[f"thumbnail_{size}_height"] = thumb.get("height", 0)

            video_data_list.append(video_data)

        if not video_data_list:
            return DataFrame(pd.DataFrame({"error": ["No results found"]}))

        video_df = pd.DataFrame(video_data_list)

        # Base fields first, then thumbnails, then anything left over
        base_cols = [
            "video_id",
            "title",
            "url",
            "channel_id",
            "channel_title",
            "published_at",
            "search_query",
            "description",
        ]
        thumb_cols = sorted(col for col in video_df.columns if col.startswith("thumbnail_"))
        known_cols = set(base_cols) | set(thumb_cols)
        other_cols = [col for col in video_df.columns if col not in known_cols]

        return DataFrame(video_df[base_cols + thumb_cols + other_cols])

    except HttpError as e:
        return DataFrame(pd.DataFrame({"error": [f"An HTTP error occurred: {e!s}"]}))

    except (KeyError, pd.errors.EmptyDataError) as e:
        return DataFrame(pd.DataFrame({"error": [f"An unexpected error occurred: {e!s}"]}))
+ icon: str = "YouTube" + name = "YouTubeTrending" + + # Dictionary of country codes and names + COUNTRY_CODES = { + "Global": "US", # Default to US for global + "United States": "US", + "Brazil": "BR", + "United Kingdom": "GB", + "India": "IN", + "Japan": "JP", + "South Korea": "KR", + "Germany": "DE", + "France": "FR", + "Canada": "CA", + "Australia": "AU", + "Spain": "ES", + "Italy": "IT", + "Mexico": "MX", + "Russia": "RU", + "Netherlands": "NL", + "Poland": "PL", + "Argentina": "AR", + } + + # Dictionary of video categories + VIDEO_CATEGORIES = { + "All": "0", + "Film & Animation": "1", + "Autos & Vehicles": "2", + "Music": "10", + "Pets & Animals": "15", + "Sports": "17", + "Travel & Events": "19", + "Gaming": "20", + "People & Blogs": "22", + "Comedy": "23", + "Entertainment": "24", + "News & Politics": "25", + "Education": "27", + "Science & Technology": "28", + "Nonprofits & Activism": "29", + } + + inputs = [ + SecretStrInput( + name="api_key", + display_name="YouTube API Key", + info="Your YouTube Data API key.", + required=True, + ), + DropdownInput( + name="region", + display_name="Region", + options=list(COUNTRY_CODES.keys()), + value="Global", + info="The region to get trending videos from.", + ), + DropdownInput( + name="category", + display_name="Category", + options=list(VIDEO_CATEGORIES.keys()), + value="All", + info="The category of videos to retrieve.", + ), + IntInput( + name="max_results", + display_name="Max Results", + value=10, + info="Maximum number of trending videos to return (1-50).", + ), + BoolInput( + name="include_statistics", + display_name="Include Statistics", + value=True, + info="Include video statistics (views, likes, comments).", + ), + BoolInput( + name="include_content_details", + display_name="Include Content Details", + value=True, + info="Include video duration and quality info.", + advanced=True, + ), + BoolInput( + name="include_thumbnails", + display_name="Include Thumbnails", + value=True, + info="Include video 
thumbnail URLs.", + advanced=True, + ), + ] + + outputs = [ + Output(name="trending_videos", display_name="Trending Videos", method="get_trending_videos"), + ] + + max_results: int + + def _format_duration(self, duration: str) -> str: + """Formats ISO 8601 duration to readable format.""" + import re + + # Remove 'PT' from the start of duration + duration = duration[2:] + + hours = 0 + minutes = 0 + seconds = 0 + + # Extract hours, minutes and seconds + time_dict = {} + for time_unit in ["H", "M", "S"]: + match = re.search(r"(\d+)" + time_unit, duration) + if match: + time_dict[time_unit] = int(match.group(1)) + + if "H" in time_dict: + hours = time_dict["H"] + if "M" in time_dict: + minutes = time_dict["M"] + if "S" in time_dict: + seconds = time_dict["S"] + + # Format the time string + if hours > 0: + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + return f"{minutes:02d}:{seconds:02d}" + + def get_trending_videos(self) -> DataFrame: + """Retrieves trending videos from YouTube and returns as DataFrame.""" + try: + # Validate max_results + if not 1 <= self.max_results <= MAX_API_RESULTS: + self.max_results = min(max(1, self.max_results), MAX_API_RESULTS) + + # Build YouTube API client + youtube = build("youtube", "v3", developerKey=self.api_key) + + # Get country code + region_code = self.COUNTRY_CODES[self.region] + + # Prepare API request parts + parts = ["snippet"] + if self.include_statistics: + parts.append("statistics") + if self.include_content_details: + parts.append("contentDetails") + + # Prepare API request parameters + request_params = { + "part": ",".join(parts), + "chart": "mostPopular", + "regionCode": region_code, + "maxResults": self.max_results, + } + + # Add category filter if not "All" + if self.category != "All": + request_params["videoCategoryId"] = self.VIDEO_CATEGORIES[self.category] + + # Get trending videos + request = youtube.videos().list(**request_params) + response = request.execute() + + videos_data = [] + for item in 
response.get("items", []): + video_data = { + "video_id": item["id"], + "title": item["snippet"]["title"], + "description": item["snippet"]["description"], + "channel_id": item["snippet"]["channelId"], + "channel_title": item["snippet"]["channelTitle"], + "published_at": item["snippet"]["publishedAt"], + "url": f"https://www.youtube.com/watch?v={item['id']}", + "region": self.region, + "category": self.category, + } + + # Add thumbnails if requested + if self.include_thumbnails: + for size, thumb in item["snippet"]["thumbnails"].items(): + video_data[f"thumbnail_{size}_url"] = thumb["url"] + video_data[f"thumbnail_{size}_width"] = thumb.get("width", 0) + video_data[f"thumbnail_{size}_height"] = thumb.get("height", 0) + + # Add statistics if requested + if self.include_statistics and "statistics" in item: + video_data.update( + { + "view_count": int(item["statistics"].get("viewCount", 0)), + "like_count": int(item["statistics"].get("likeCount", 0)), + "comment_count": int(item["statistics"].get("commentCount", 0)), + } + ) + + # Add content details if requested + if self.include_content_details and "contentDetails" in item: + content_details = item["contentDetails"] + video_data.update( + { + "duration": self._format_duration(content_details["duration"]), + "definition": content_details.get("definition", "hd").upper(), + "has_captions": content_details.get("caption", "false") == "true", + "licensed_content": content_details.get("licensedContent", False), + "projection": content_details.get("projection", "rectangular"), + } + ) + + videos_data.append(video_data) + + # Convert to DataFrame + videos_df = pd.DataFrame(videos_data) + + # Organize columns + column_order = [ + "video_id", + "title", + "channel_id", + "channel_title", + "category", + "region", + "published_at", + "url", + "description", + ] + + if self.include_statistics: + column_order.extend(["view_count", "like_count", "comment_count"]) + + if self.include_content_details: + 
column_order.extend(["duration", "definition", "has_captions", "licensed_content", "projection"]) + + # Add thumbnail columns at the end if included + if self.include_thumbnails: + thumbnail_cols = [col for col in videos_df.columns if col.startswith("thumbnail_")] + column_order.extend(sorted(thumbnail_cols)) + + # Reorder columns, including any that might not be in column_order + remaining_cols = [col for col in videos_df.columns if col not in column_order] + videos_df = videos_df[column_order + remaining_cols] + + return DataFrame(videos_df) + + except HttpError as e: + error_message = f"YouTube API error: {e}" + if e.resp.status == HTTP_FORBIDDEN: + error_message = "API quota exceeded or access forbidden." + elif e.resp.status == HTTP_NOT_FOUND: + error_message = "Resource not found." + + return DataFrame(pd.DataFrame({"error": [error_message]})) + + except Exception as e: + import logging + + logging.exception("An unexpected error occurred:") + return DataFrame(pd.DataFrame({"error": [str(e)]})) diff --git a/src/backend/base/langflow/components/youtube/video_details.py b/src/backend/base/langflow/components/youtube/video_details.py new file mode 100644 index 000000000000..b375e79fdda7 --- /dev/null +++ b/src/backend/base/langflow/components/youtube/video_details.py @@ -0,0 +1,252 @@ +import googleapiclient +import pandas as pd +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError + +from langflow.custom import Component +from langflow.inputs import BoolInput, MessageTextInput, SecretStrInput +from langflow.schema import DataFrame +from langflow.template import Output + + +class YouTubeVideoDetailsComponent(Component): + """A component that retrieves detailed information about YouTube videos.""" + + display_name: str = "YouTube Video Details" + description: str = "Retrieves detailed information and statistics about YouTube videos." 
+ icon: str = "YouTube" + name = "YouTubeVideoDetails" + + inputs = [ + MessageTextInput( + name="video_url", + display_name="Video URL", + info="The URL of the YouTube video.", + tool_mode=True, + ), + SecretStrInput( + name="api_key", + display_name="YouTube API Key", + info="Your YouTube Data API key.", + required=True, + ), + BoolInput( + name="include_statistics", + display_name="Include Statistics", + value=True, + info="Include video statistics (views, likes, comments).", + ), + BoolInput( + name="include_content_details", + display_name="Include Content Details", + value=True, + info="Include video duration, quality, and age restriction info.", + advanced=True, + ), + BoolInput( + name="include_tags", + display_name="Include Tags", + value=True, + info="Include video tags and keywords.", + advanced=True, + ), + BoolInput( + name="include_thumbnails", + display_name="Include Thumbnails", + value=True, + info="Include video thumbnail URLs in different resolutions.", + advanced=True, + ), + ] + + outputs = [ + Output(name="video_data", display_name="Video Data", method="get_video_details"), + ] + + API_FORBIDDEN = 403 + VIDEO_NOT_FOUND = 404 + + def _extract_video_id(self, video_url: str) -> str: + """Extracts the video ID from a YouTube URL.""" + import re + + patterns = [ + r"(?:youtube\.com\/watch\?v=|youtu.be\/|youtube.com\/embed\/)([^&\n?#]+)", + r"youtube.com\/shorts\/([^&\n?#]+)", + ] + + for pattern in patterns: + match = re.search(pattern, video_url) + if match: + return match.group(1) + + return video_url.strip() + + def _format_duration(self, duration: str) -> str: + """Formats the ISO 8601 duration to a readable format.""" + import re + + hours = 0 + minutes = 0 + seconds = 0 + + hours_match = re.search(r"(\d+)H", duration) + minutes_match = re.search(r"(\d+)M", duration) + seconds_match = re.search(r"(\d+)S", duration) + + if hours_match: + hours = int(hours_match.group(1)) + if minutes_match: + minutes = int(minutes_match.group(1)) + if 
seconds_match: + seconds = int(seconds_match.group(1)) + + if hours > 0: + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + return f"{minutes:02d}:{seconds:02d}" + + def get_video_details(self) -> DataFrame: + """Retrieves detailed information about a YouTube video and returns as DataFrame.""" + try: + # Extract video ID and initialize API + video_id = self._extract_video_id(self.video_url) + youtube = build("youtube", "v3", developerKey=self.api_key) + + # Prepare parts for the API request + parts = ["snippet"] + if self.include_statistics: + parts.append("statistics") + if self.include_content_details: + parts.append("contentDetails") + + # Get video information + video_response = youtube.videos().list(part=",".join(parts), id=video_id).execute() + + if not video_response["items"]: + return DataFrame(pd.DataFrame({"error": ["Video not found"]})) + + video_info = video_response["items"][0] + snippet = video_info["snippet"] + + # Build video data dictionary + video_data = { + "video_id": [video_id], + "url": [f"https://www.youtube.com/watch?v={video_id}"], + "title": [snippet["title"]], + "description": [snippet["description"]], + "published_at": [snippet["publishedAt"]], + "channel_id": [snippet["channelId"]], + "channel_title": [snippet["channelTitle"]], + "category_id": [snippet.get("categoryId", "Unknown")], + "live_broadcast_content": [snippet.get("liveBroadcastContent", "none")], + } + + # Add thumbnails if requested + if self.include_thumbnails: + for size, thumb in snippet["thumbnails"].items(): + video_data[f"thumbnail_{size}_url"] = [thumb["url"]] + video_data[f"thumbnail_{size}_width"] = [thumb.get("width", 0)] + video_data[f"thumbnail_{size}_height"] = [thumb.get("height", 0)] + + # Add tags if requested + if self.include_tags and "tags" in snippet: + video_data["tags"] = [", ".join(snippet["tags"])] + video_data["tags_count"] = [len(snippet["tags"])] + + # Add statistics if requested + if self.include_statistics and "statistics" in video_info: + 
stats = video_info["statistics"] + video_data.update( + { + "view_count": [int(stats.get("viewCount", 0))], + "like_count": [int(stats.get("likeCount", 0))], + "favorite_count": [int(stats.get("favoriteCount", 0))], + "comment_count": [int(stats.get("commentCount", 0))], + } + ) + + # Add content details if requested + if self.include_content_details and "contentDetails" in video_info: + content_details = video_info["contentDetails"] + video_data.update( + { + "duration": [self._format_duration(content_details["duration"])], + "dimension": [content_details.get("dimension", "2d")], + "definition": [content_details.get("definition", "hd").upper()], + "has_captions": [content_details.get("caption", "false") == "true"], + "licensed_content": [content_details.get("licensedContent", False)], + "projection": [content_details.get("projection", "rectangular")], + "has_custom_thumbnails": [content_details.get("hasCustomThumbnail", False)], + } + ) + + # Add content rating if available + if "contentRating" in content_details: + rating_info = content_details["contentRating"] + video_data["content_rating"] = [str(rating_info)] + + # Create DataFrame with organized columns + video_df = pd.DataFrame(video_data) + + # Organize columns in logical groups + basic_cols = [ + "video_id", + "title", + "url", + "channel_id", + "channel_title", + "published_at", + "category_id", + "live_broadcast_content", + "description", + ] + + stat_cols = ["view_count", "like_count", "favorite_count", "comment_count"] + + content_cols = [ + "duration", + "dimension", + "definition", + "has_captions", + "licensed_content", + "projection", + "has_custom_thumbnails", + "content_rating", + ] + + tag_cols = ["tags", "tags_count"] + + thumb_cols = [col for col in video_df.columns if col.startswith("thumbnail_")] + + # Reorder columns based on what's included + ordered_cols = basic_cols[:] + + if self.include_statistics: + ordered_cols.extend([col for col in stat_cols if col in video_df.columns]) + + if 
self.include_content_details: + ordered_cols.extend([col for col in content_cols if col in video_df.columns]) + + if self.include_tags: + ordered_cols.extend([col for col in tag_cols if col in video_df.columns]) + + if self.include_thumbnails: + ordered_cols.extend(sorted(thumb_cols)) + + # Add any remaining columns + remaining_cols = [col for col in video_df.columns if col not in ordered_cols] + ordered_cols.extend(remaining_cols) + + return DataFrame(video_df[ordered_cols]) + + except (HttpError, googleapiclient.errors.HttpError) as e: + error_message = f"YouTube API error: {e!s}" + if e.resp.status == self.API_FORBIDDEN: + error_message = "API quota exceeded or access forbidden." + elif e.resp.status == self.VIDEO_NOT_FOUND: + error_message = "Video not found." + + return DataFrame(pd.DataFrame({"error": [error_message]})) + + except KeyError as e: + return DataFrame(pd.DataFrame({"error": [str(e)]})) \ No newline at end of file diff --git a/src/backend/base/langflow/components/youtube/youtube_transcripts.py b/src/backend/base/langflow/components/youtube/youtube_transcripts.py new file mode 100644 index 000000000000..a2cff988a075 --- /dev/null +++ b/src/backend/base/langflow/components/youtube/youtube_transcripts.py @@ -0,0 +1,243 @@ +from langchain_community.document_loaders import YoutubeLoader +from langchain_community.document_loaders.youtube import TranscriptFormat + +from langflow.custom import Component +from langflow.inputs import DropdownInput, IntInput, MultilineInput +from langflow.schema import Message +from langflow.template import Output + + +class YouTubeTranscriptsComponent(Component): + """A component that extracts spoken content from YouTube videos as transcripts.""" + + display_name: str = "YouTube Transcripts" + description: str = "Extracts spoken content from YouTube videos as transcripts." 
+ icon: str = "YouTube" + name = "YouTubeTranscripts" + + inputs = [ + MultilineInput( + name="url", + display_name="Video URL", + info="Enter the YouTube video URL to get transcripts from.", + tool_mode=True, + ), + DropdownInput( + name="transcript_format", + display_name="Transcript Format", + options=["text", "chunks"], + value="text", + info="The format of the transcripts. Either 'text' for a single output or 'chunks' for timestamped chunks.", + advanced=True, + ), + IntInput( + name="chunk_size_seconds", + display_name="Chunk Size (seconds)", + value=60, + advanced=True, + info="The size of each transcript chunk in seconds. Only applicable when " + "'Transcript Format' is set to 'chunks'.", + ), + DropdownInput( + name="language", + display_name="Language", + options=[ + "af", + "ak", + "sq", + "am", + "ar", + "hy", + "as", + "ay", + "az", + "bn", + "eu", + "be", + "bho", + "bs", + "bg", + "my", + "ca", + "ceb", + "zh", + "zh-HK", + "zh-CN", + "zh-SG", + "zh-TW", + "zh-Hans", + "zh-Hant", + "hak-TW", + "nan-TW", + "co", + "hr", + "cs", + "da", + "dv", + "nl", + "en", + "en-US", + "eo", + "et", + "ee", + "fil", + "fi", + "fr", + "gl", + "lg", + "ka", + "de", + "el", + "gn", + "gu", + "ht", + "ha", + "haw", + "iw", + "hi", + "hmn", + "hu", + "is", + "ig", + "id", + "ga", + "it", + "ja", + "jv", + "kn", + "kk", + "km", + "rw", + "ko", + "kri", + "ku", + "ky", + "lo", + "la", + "lv", + "ln", + "lt", + "lb", + "mk", + "mg", + "ms", + "ml", + "mt", + "mi", + "mr", + "mn", + "ne", + "nso", + "no", + "ny", + "or", + "om", + "ps", + "fa", + "pl", + "pt", + "pa", + "qu", + "ro", + "ru", + "sm", + "sa", + "gd", + "sr", + "sn", + "sd", + "si", + "sk", + "sl", + "so", + "st", + "es", + "su", + "sw", + "sv", + "tg", + "ta", + "tt", + "te", + "th", + "ti", + "ts", + "tr", + "tk", + "uk", + "ur", + "ug", + "uz", + "vi", + "cy", + "fy", + "xh", + "yi", + "yo", + "zu", + ], + value="en", + info=( + "Specify to make sure the transcripts are retrieved in your desired language. 
Defaults to English: 'en'" + ), + ), + DropdownInput( + name="translation", + display_name="Translation Language", + advanced=True, + options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"], + info="Translate the transcripts to the specified language. Leave empty for no translation.", + ), + ] + + outputs = [ + Output(name="transcripts", display_name="Transcription", method="build_youtube_transcripts"), + ] + + def build_youtube_transcripts(self) -> Message: + """Method to extracts transcripts from a YouTube video URL. + + Returns: + Message: The transcripts of the video as a text string. If 'transcript_format' + is 'text', the transcripts are returned as a single continuous string. If + 'transcript_format' is 'chunks', the transcripts are returned as a string + with timestamped segments. + + Raises: + Exception: Returns an error message if transcript retrieval fails. + """ + try: + # Attempt to load transcripts in the specified language, fallback to any available language + languages = [self.language] if self.language else None + loader = YoutubeLoader.from_youtube_url( + self.url, + transcript_format=TranscriptFormat.TEXT + if self.transcript_format == "text" + else TranscriptFormat.CHUNKS, + chunk_size_seconds=self.chunk_size_seconds, + language=languages, + translation=self.translation or None, + ) + + transcripts = loader.load() + + if self.transcript_format == "text": + # Extract only the page_content from the Document + result = transcripts[0].page_content + return Message(text=result) + + # For chunks, format the output with timestamps + formatted_chunks = [] + for doc in transcripts: + start_seconds = int(doc.metadata["start_seconds"]) + start_minutes = start_seconds // 60 + start_seconds %= 60 + timestamp = f"{start_minutes:02d}:{start_seconds:02d}" + formatted_chunks.append(f"{timestamp} {doc.page_content}") + result = "\n".join(formatted_chunks) + return Message(text=result) + + except Exception as exc: # noqa: BLE001 + # 
Using a specific error type for the return value + error_msg = f"Failed to get YouTube transcripts: {exc!s}" + return Message(text=error_msg) From 2065f5fb85331b8fcd44cf6efed5e22ae3e6ff01 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:45:26 +0000 Subject: [PATCH 02/16] [autofix.ci] apply automated fixes --- src/backend/base/langflow/components/youtube/__init__.py | 4 ++-- src/backend/base/langflow/components/youtube/channel.py | 2 +- src/backend/base/langflow/components/youtube/comments.py | 2 +- src/backend/base/langflow/components/youtube/playlist.py | 2 +- src/backend/base/langflow/components/youtube/search.py | 2 +- src/backend/base/langflow/components/youtube/video_details.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/backend/base/langflow/components/youtube/__init__.py b/src/backend/base/langflow/components/youtube/__init__.py index d53440b04597..4c4ab6f3326c 100644 --- a/src/backend/base/langflow/components/youtube/__init__.py +++ b/src/backend/base/langflow/components/youtube/__init__.py @@ -2,9 +2,9 @@ from .comments import YouTubeCommentsComponent from .playlist import YouTubePlaylistComponent from .search import YouTubeSearchComponent -from .youtube_transcripts import YouTubeTranscriptsComponent from .trending import YouTubeTrendingComponent from .video_details import YouTubeVideoDetailsComponent +from .youtube_transcripts import YouTubeTranscriptsComponent __all__ = [ "YouTubeChannelComponent", @@ -14,4 +14,4 @@ "YouTubeTranscriptsComponent", "YouTubeTrendingComponent", "YouTubeVideoDetailsComponent", -] \ No newline at end of file +] diff --git a/src/backend/base/langflow/components/youtube/channel.py b/src/backend/base/langflow/components/youtube/channel.py index 1b7ff9311c42..f70fe99cbead 100644 --- a/src/backend/base/langflow/components/youtube/channel.py +++ b/src/backend/base/langflow/components/youtube/channel.py @@ -224,4 +224,4 @@ def 
get_channel_info(self) -> DataFrame: return DataFrame(channel_df) except (HttpError, HTTPError, Exception) as e: - return DataFrame(pd.DataFrame({"error": [str(e)]})) \ No newline at end of file + return DataFrame(pd.DataFrame({"error": [str(e)]})) diff --git a/src/backend/base/langflow/components/youtube/comments.py b/src/backend/base/langflow/components/youtube/comments.py index 0aa92f0d89eb..456fe1a5f0ba 100644 --- a/src/backend/base/langflow/components/youtube/comments.py +++ b/src/backend/base/langflow/components/youtube/comments.py @@ -228,4 +228,4 @@ def get_video_comments(self) -> DataFrame: elif e.resp.status == self.NOT_FOUND_STATUS: error_message = "Video not found." - return DataFrame(pd.DataFrame({"error": [error_message]})) \ No newline at end of file + return DataFrame(pd.DataFrame({"error": [error_message]})) diff --git a/src/backend/base/langflow/components/youtube/playlist.py b/src/backend/base/langflow/components/youtube/playlist.py index 597a09d4772a..604128cd16d6 100644 --- a/src/backend/base/langflow/components/youtube/playlist.py +++ b/src/backend/base/langflow/components/youtube/playlist.py @@ -28,4 +28,4 @@ def extract_video_urls(self) -> DataFrame: playlist = Playlist(playlist_url) video_urls = [video.watch_url for video in playlist.videos] - return DataFrame([Data(data={"video_url": url}) for url in video_urls]) \ No newline at end of file + return DataFrame([Data(data={"video_url": url}) for url in video_urls]) diff --git a/src/backend/base/langflow/components/youtube/search.py b/src/backend/base/langflow/components/youtube/search.py index 4f9209a14d3b..765ac75e08ec 100644 --- a/src/backend/base/langflow/components/youtube/search.py +++ b/src/backend/base/langflow/components/youtube/search.py @@ -115,4 +115,4 @@ def search_youtube(self) -> DataFrame: return DataFrame(pd.DataFrame({"error": [f"An HTTP error occurred: {e!s}"]})) except (KeyError, pd.errors.EmptyDataError) as e: - return DataFrame(pd.DataFrame({"error": [f"An unexpected 
error occurred: {e!s}"]})) \ No newline at end of file + return DataFrame(pd.DataFrame({"error": [f"An unexpected error occurred: {e!s}"]})) diff --git a/src/backend/base/langflow/components/youtube/video_details.py b/src/backend/base/langflow/components/youtube/video_details.py index b375e79fdda7..5d60fb489662 100644 --- a/src/backend/base/langflow/components/youtube/video_details.py +++ b/src/backend/base/langflow/components/youtube/video_details.py @@ -249,4 +249,4 @@ def get_video_details(self) -> DataFrame: return DataFrame(pd.DataFrame({"error": [error_message]})) except KeyError as e: - return DataFrame(pd.DataFrame({"error": [str(e)]})) \ No newline at end of file + return DataFrame(pd.DataFrame({"error": [str(e)]})) From 9629742705000dcf187e83955fec341616a5597c Mon Sep 17 00:00:00 2001 From: Raphael Christi Date: Wed, 15 Jan 2025 20:53:38 -0300 Subject: [PATCH 03/16] feat: add YouTube bundle configuration to constants and style utils --- src/frontend/src/constants/constants.ts | 2 ++ src/frontend/src/utils/styleUtils.ts | 1 + 2 files changed, 3 insertions(+) diff --git a/src/frontend/src/constants/constants.ts b/src/frontend/src/constants/constants.ts index 10e11ff739ab..f70d2bb6364b 100644 --- a/src/frontend/src/constants/constants.ts +++ b/src/frontend/src/constants/constants.ts @@ -742,6 +742,8 @@ export const BUNDLES_SIDEBAR_FOLDER_NAMES = [ "assemblyai", "LangWatch", "langwatch", + "Youtube", + "youtube", ]; export const AUTHORIZED_DUPLICATE_REQUESTS = [ diff --git a/src/frontend/src/utils/styleUtils.ts b/src/frontend/src/utils/styleUtils.ts index b8e0ddaf66b7..b53fe4e04f7a 100644 --- a/src/frontend/src/utils/styleUtils.ts +++ b/src/frontend/src/utils/styleUtils.ts @@ -535,6 +535,7 @@ export const SIDEBAR_BUNDLES = [ { display_name: "Git", name: "git", icon: "GitLoader" }, { display_name: "Confluence", name: "confluence", icon: "Confluence" }, { display_name: "Mem0", name: "mem0", icon: "Mem0" }, + { display_name: "Youtube", name: "youtube", icon: 
"Youtube" }, ]; export const categoryIcons = { From 2003118e239affeb6fd5e30914f0ea172f94d3c8 Mon Sep 17 00:00:00 2001 From: Raphael Christi Date: Thu, 16 Jan 2025 10:15:54 -0300 Subject: [PATCH 04/16] fix: remove name attribute from YouTube components --- src/backend/base/langflow/components/youtube/channel.py | 1 - src/backend/base/langflow/components/youtube/comments.py | 1 - src/backend/base/langflow/components/youtube/search.py | 1 - src/backend/base/langflow/components/youtube/trending.py | 1 - src/backend/base/langflow/components/youtube/video_details.py | 1 - .../base/langflow/components/youtube/youtube_transcripts.py | 1 - 6 files changed, 6 deletions(-) diff --git a/src/backend/base/langflow/components/youtube/channel.py b/src/backend/base/langflow/components/youtube/channel.py index f70fe99cbead..991b5f280111 100644 --- a/src/backend/base/langflow/components/youtube/channel.py +++ b/src/backend/base/langflow/components/youtube/channel.py @@ -25,7 +25,6 @@ class YouTubeChannelComponent(Component): display_name: str = "YouTube Channel" description: str = "Retrieves detailed information and statistics about YouTube channels as a DataFrame." icon: str = "YouTube" - name = "YouTubeChannel" # Constants CHANNEL_ID_LENGTH = 24 diff --git a/src/backend/base/langflow/components/youtube/comments.py b/src/backend/base/langflow/components/youtube/comments.py index 456fe1a5f0ba..415a20495cd6 100644 --- a/src/backend/base/langflow/components/youtube/comments.py +++ b/src/backend/base/langflow/components/youtube/comments.py @@ -22,7 +22,6 @@ class YouTubeCommentsComponent(Component): display_name: str = "YouTube Comments" description: str = "Retrieves and analyzes comments from YouTube videos." 
icon: str = "YouTube" - name = "YouTubeComments" # Constants COMMENTS_DISABLED_STATUS = 403 diff --git a/src/backend/base/langflow/components/youtube/search.py b/src/backend/base/langflow/components/youtube/search.py index 765ac75e08ec..89ebbc7fd6b1 100644 --- a/src/backend/base/langflow/components/youtube/search.py +++ b/src/backend/base/langflow/components/youtube/search.py @@ -14,7 +14,6 @@ class YouTubeSearchComponent(Component): display_name: str = "YouTube Search" description: str = "Searches YouTube and returns a list of video data based on a query." icon: str = "YouTube" - name = "YouTubeSearch" inputs = [ MessageTextInput( diff --git a/src/backend/base/langflow/components/youtube/trending.py b/src/backend/base/langflow/components/youtube/trending.py index 2fb175ecc5dc..986991e8a948 100644 --- a/src/backend/base/langflow/components/youtube/trending.py +++ b/src/backend/base/langflow/components/youtube/trending.py @@ -18,7 +18,6 @@ class YouTubeTrendingComponent(Component): display_name: str = "YouTube Trending" description: str = "Retrieves trending videos from YouTube with filtering options." icon: str = "YouTube" - name = "YouTubeTrending" # Dictionary of country codes and names COUNTRY_CODES = { diff --git a/src/backend/base/langflow/components/youtube/video_details.py b/src/backend/base/langflow/components/youtube/video_details.py index 5d60fb489662..85bde4239df4 100644 --- a/src/backend/base/langflow/components/youtube/video_details.py +++ b/src/backend/base/langflow/components/youtube/video_details.py @@ -15,7 +15,6 @@ class YouTubeVideoDetailsComponent(Component): display_name: str = "YouTube Video Details" description: str = "Retrieves detailed information and statistics about YouTube videos." 
icon: str = "YouTube" - name = "YouTubeVideoDetails" inputs = [ MessageTextInput( diff --git a/src/backend/base/langflow/components/youtube/youtube_transcripts.py b/src/backend/base/langflow/components/youtube/youtube_transcripts.py index a2cff988a075..57fb44b80b97 100644 --- a/src/backend/base/langflow/components/youtube/youtube_transcripts.py +++ b/src/backend/base/langflow/components/youtube/youtube_transcripts.py @@ -13,7 +13,6 @@ class YouTubeTranscriptsComponent(Component): display_name: str = "YouTube Transcripts" description: str = "Extracts spoken content from YouTube videos as transcripts." icon: str = "YouTube" - name = "YouTubeTranscripts" inputs = [ MultilineInput( From 7909f4b13b276d2e163d77dccddd249339e6d3d9 Mon Sep 17 00:00:00 2001 From: Raphael Christi Date: Thu, 16 Jan 2025 11:52:12 -0300 Subject: [PATCH 05/16] fix: properly close SSL sockets to resolve ResourceWarnings in YouTube components --- .../langflow/components/youtube/channel.py | 22 +- .../langflow/components/youtube/comments.py | 152 +++++----- .../langflow/components/youtube/search.py | 170 +++++------ .../langflow/components/youtube/trending.py | 218 +++++++------- .../components/youtube/video_details.py | 271 +++++++++--------- 5 files changed, 428 insertions(+), 405 deletions(-) diff --git a/src/backend/base/langflow/components/youtube/channel.py b/src/backend/base/langflow/components/youtube/channel.py index 991b5f280111..41ae07e04e9f 100644 --- a/src/backend/base/langflow/components/youtube/channel.py +++ b/src/backend/base/langflow/components/youtube/channel.py @@ -11,14 +11,6 @@ from langflow.template import Output -class YouTubeError(Exception): - """Base exception class for YouTube-related errors.""" - - -class YouTubeAPIError(YouTubeError): - """Exception raised for YouTube API-related errors.""" - - class YouTubeChannelComponent(Component): """A component that retrieves detailed information about YouTube channels.""" @@ -96,6 +88,7 @@ def _extract_channel_id(self, 
channel_url: str) -> str: def _get_channel_id_by_name(self, channel_name: str, identifier_type: str) -> str: """Gets the channel ID using the channel name or custom URL.""" + youtube = None try: youtube = build("youtube", "v3", developerKey=self.api_key) @@ -109,14 +102,17 @@ def _get_channel_id_by_name(self, channel_name: str, identifier_type: str) -> st return response["items"][0]["id"]["channelId"] error_msg = f"Could not find channel ID for: {channel_name}" - raise YouTubeError(error_msg) + raise ValueError(error_msg) except (HttpError, HTTPError) as e: error_msg = f"YouTube API error while getting channel ID: {e!s}" - raise YouTubeAPIError(error_msg) from e + raise RuntimeError(error_msg) from e except Exception as e: error_msg = f"Unexpected error while getting channel ID: {e!s}" - raise YouTubeError(error_msg) from e + raise ValueError(error_msg) from e + finally: + if youtube: + youtube.close() def _get_channel_playlists(self, youtube: Any, channel_id: str) -> list[dict[str, Any]]: """Gets the public playlists for a channel.""" @@ -148,6 +144,7 @@ def _get_channel_playlists(self, youtube: Any, channel_id: str) -> list[dict[str def get_channel_info(self) -> DataFrame: """Retrieves channel information and returns it as a DataFrame.""" + youtube = None try: # Get channel ID and initialize YouTube API client channel_id = self._extract_channel_id(self.channel_url) @@ -224,3 +221,6 @@ def get_channel_info(self) -> DataFrame: except (HttpError, HTTPError, Exception) as e: return DataFrame(pd.DataFrame({"error": [str(e)]})) + finally: + if youtube: + youtube.close() diff --git a/src/backend/base/langflow/components/youtube/comments.py b/src/backend/base/langflow/components/youtube/comments.py index 415a20495cd6..0976326ad119 100644 --- a/src/backend/base/langflow/components/youtube/comments.py +++ b/src/backend/base/langflow/components/youtube/comments.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager + import pandas as pd from googleapiclient.discovery 
import build from googleapiclient.errors import HttpError @@ -8,14 +10,6 @@ from langflow.template import Output -class YouTubeError(Exception): - """Base exception class for YouTube-related errors.""" - - -class YouTubeAPIError(YouTubeError): - """Exception raised for YouTube API-related errors.""" - - class YouTubeCommentsComponent(Component): """A component that retrieves comments from YouTube videos.""" @@ -145,80 +139,86 @@ def _process_comment( return processed_comments + @contextmanager + def youtube_client(self): + """Context manager for YouTube API client.""" + client = build("youtube", "v3", developerKey=self.api_key) + try: + yield client + finally: + client.close() + def get_video_comments(self) -> DataFrame: """Retrieves comments from a YouTube video and returns as DataFrame.""" try: # Extract video ID from URL video_id = self._extract_video_id(self.video_url) - # Initialize YouTube API client - youtube = build("youtube", "v3", developerKey=self.api_key) - - # Prepare the initial request - request = youtube.commentThreads().list( - part="snippet,replies", - videoId=video_id, - maxResults=min(self.API_MAX_RESULTS, self.max_results), - order=self.sort_by, - textFormat="plainText", - ) - - comments_data = [] - results_count = 0 - - while request and results_count < self.max_results: - response = request.execute() - - for item in response.get("items", []): - if results_count >= self.max_results: - break - - comments = self._process_comment( - item, include_metrics=self.include_metrics, include_replies=self.include_replies - ) - comments_data.extend(comments) - results_count += 1 - - # Get the next page if available and needed - if "nextPageToken" in response and results_count < self.max_results: - request = youtube.commentThreads().list( - part="snippet,replies", - videoId=video_id, - maxResults=min(self.API_MAX_RESULTS, self.max_results - results_count), - order=self.sort_by, - textFormat="plainText", - pageToken=response["nextPageToken"], - ) - else: - 
request = None - - # Convert to DataFrame - comments_df = pd.DataFrame(comments_data) - - # Add video metadata - comments_df["video_id"] = video_id - comments_df["video_url"] = self.video_url - - # Sort columns for better organization - column_order = [ - "video_id", - "video_url", - "comment_id", - "parent_comment_id", - "is_reply", - "author", - "author_channel_url", - "text", - "published_at", - "updated_at", - ] - - if self.include_metrics: - column_order.extend(["like_count", "reply_count"]) - - comments_df = comments_df[column_order] - - return DataFrame(comments_df) + # Use context manager for YouTube API client + with self.youtube_client() as youtube: + comments_data = [] + results_count = 0 + request = youtube.commentThreads().list( + part="snippet,replies", + videoId=video_id, + maxResults=min(self.API_MAX_RESULTS, self.max_results), + order=self.sort_by, + textFormat="plainText", + ) + + while request and results_count < self.max_results: + response = request.execute() + + for item in response.get("items", []): + if results_count >= self.max_results: + break + + comments = self._process_comment( + item, include_metrics=self.include_metrics, include_replies=self.include_replies + ) + comments_data.extend(comments) + results_count += 1 + + # Get the next page if available and needed + if "nextPageToken" in response and results_count < self.max_results: + request = youtube.commentThreads().list( + part="snippet,replies", + videoId=video_id, + maxResults=min(self.API_MAX_RESULTS, self.max_results - results_count), + order=self.sort_by, + textFormat="plainText", + pageToken=response["nextPageToken"], + ) + else: + request = None + + # Convert to DataFrame + comments_df = pd.DataFrame(comments_data) + + # Add video metadata + comments_df["video_id"] = video_id + comments_df["video_url"] = self.video_url + + # Sort columns for better organization + column_order = [ + "video_id", + "video_url", + "comment_id", + "parent_comment_id", + "is_reply", + "author", + 
"author_channel_url", + "text", + "published_at", + "updated_at", + ] + + if self.include_metrics: + column_order.extend(["like_count", "reply_count"]) + + comments_df = comments_df[column_order] + + return DataFrame(comments_df) except HttpError as e: error_message = f"YouTube API error: {e!s}" diff --git a/src/backend/base/langflow/components/youtube/search.py b/src/backend/base/langflow/components/youtube/search.py index 89ebbc7fd6b1..7d793ed2d152 100644 --- a/src/backend/base/langflow/components/youtube/search.py +++ b/src/backend/base/langflow/components/youtube/search.py @@ -1,117 +1,119 @@ +from contextlib import contextmanager + import pandas as pd from googleapiclient.discovery import build from googleapiclient.errors import HttpError from langflow.custom import Component -from langflow.inputs import IntInput, MessageTextInput, SecretStrInput +from langflow.inputs import BoolInput, DropdownInput, IntInput, MessageTextInput, SecretStrInput from langflow.schema import DataFrame from langflow.template import Output class YouTubeSearchComponent(Component): - """A component that searches YouTube and returns a list of video data.""" + """A component that searches YouTube videos.""" display_name: str = "YouTube Search" - description: str = "Searches YouTube and returns a list of video data based on a query." + description: str = "Searches YouTube videos based on query." 
icon: str = "YouTube" inputs = [ MessageTextInput( name="query", display_name="Search Query", - info="Enter the search query for YouTube videos.", + info="The search query to look for on YouTube.", tool_mode=True, ), - IntInput( - name="max_results", - display_name="Max Results", - value=5, - info="The maximum number of video results to return.", - ), SecretStrInput( name="api_key", display_name="YouTube API Key", info="Your YouTube Data API key.", + required=True, + ), + IntInput( + name="max_results", + display_name="Max Results", + value=10, + info="The maximum number of results to return.", + ), + DropdownInput( + name="order", + display_name="Sort Order", + options=["relevance", "date", "rating", "title", "viewCount"], + value="relevance", + info="Sort order for the search results.", + ), + BoolInput( + name="include_metadata", + display_name="Include Metadata", + value=True, + info="Include video metadata like description and statistics.", + advanced=True, ), ] outputs = [ - Output(name="video_data", display_name="Video Data", method="search_youtube"), + Output(name="results", display_name="Search Results", method="search_videos"), ] - def search_youtube(self) -> DataFrame: - """Searches YouTube and returns video data as a DataFrame.""" + @contextmanager + def youtube_client(self): + """Context manager for YouTube API client.""" + client = build("youtube", "v3", developerKey=self.api_key) try: - # Initialize YouTube API client - youtube = build("youtube", "v3", developerKey=self.api_key) - - # Perform initial search - search_response = ( - youtube.search() - .list(q=self.query, type="video", part="id,snippet", maxResults=self.max_results) - .execute() - ) - - # Prepare data for DataFrame - video_data_list = [] - for search_result in search_response.get("items", []): - video_id = search_result["id"]["videoId"] - snippet = search_result["snippet"] - - video_data = { - "video_id": video_id, - "url": f"https://www.youtube.com/watch?v={video_id}", - "title": 
snippet["title"], - "description": snippet["description"], - "channel_id": snippet["channelId"], - "channel_title": snippet["channelTitle"], - "published_at": snippet["publishedAt"], - "search_query": self.query, - } - - # Add thumbnails - thumbnails = snippet["thumbnails"] - for size, thumb in thumbnails.items(): - video_data[f"thumbnail_{size}_url"] = thumb["url"] - video_data[f"thumbnail_{size}_width"] = thumb.get("width", 0) - video_data[f"thumbnail_{size}_height"] = thumb.get("height", 0) - - video_data_list.append(video_data) - - if not video_data_list: - return DataFrame(pd.DataFrame({"error": ["No results found"]})) - - # Create DataFrame - video_df = pd.DataFrame(video_data_list) - - # Organize columns in logical groups - base_cols = [ - "video_id", - "title", - "url", - "channel_id", - "channel_title", - "published_at", - "search_query", - "description", - ] - - thumb_cols = sorted([col for col in video_df.columns if col.startswith("thumbnail_")]) - - # Get remaining columns that don't fit in any category - all_defined_cols = base_cols + thumb_cols - other_cols = [col for col in video_df.columns if col not in all_defined_cols] - - # Combine all columns in desired order - ordered_cols = base_cols + thumb_cols + other_cols - - # Reorder DataFrame columns - video_df = video_df[ordered_cols] - - return DataFrame(video_df) + yield client + finally: + client.close() - except HttpError as e: - return DataFrame(pd.DataFrame({"error": [f"An HTTP error occurred: {e!s}"]})) + def search_videos(self) -> DataFrame: + """Searches YouTube videos and returns results as DataFrame.""" + try: + with self.youtube_client() as youtube: + search_response = ( + youtube.search() + .list( + q=self.query, + part="id,snippet", + maxResults=self.max_results, + order=self.order, + type="video", + ) + .execute() + ) + + results = [] + for search_result in search_response.get("items", []): + video_id = search_result["id"]["videoId"] + snippet = search_result["snippet"] + + result = { + 
"video_id": video_id, + "title": snippet["title"], + "description": snippet["description"], + "published_at": snippet["publishedAt"], + "channel_title": snippet["channelTitle"], + "thumbnail_url": snippet["thumbnails"]["default"]["url"], + } + + if self.include_metadata: + # Get video details for additional metadata + video_response = youtube.videos().list(part="statistics,contentDetails", id=video_id).execute() + + if video_response.get("items"): + video_details = video_response["items"][0] + result.update( + { + "view_count": int(video_details["statistics"]["viewCount"]), + "like_count": int(video_details["statistics"].get("likeCount", 0)), + "comment_count": int(video_details["statistics"].get("commentCount", 0)), + "duration": video_details["contentDetails"]["duration"], + } + ) + + results.append(result) + + return DataFrame(pd.DataFrame(results)) - except (KeyError, pd.errors.EmptyDataError) as e: - return DataFrame(pd.DataFrame({"error": [f"An unexpected error occurred: {e!s}"]})) + except HttpError as e: + error_message = f"YouTube API error: {e!s}" + return DataFrame(pd.DataFrame({"error": [error_message]})) diff --git a/src/backend/base/langflow/components/youtube/trending.py b/src/backend/base/langflow/components/youtube/trending.py index 986991e8a948..85ad669771b5 100644 --- a/src/backend/base/langflow/components/youtube/trending.py +++ b/src/backend/base/langflow/components/youtube/trending.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager + import pandas as pd from googleapiclient.discovery import build from googleapiclient.errors import HttpError @@ -145,6 +147,15 @@ def _format_duration(self, duration: str) -> str: return f"{hours:02d}:{minutes:02d}:{seconds:02d}" return f"{minutes:02d}:{seconds:02d}" + @contextmanager + def youtube_client(self): + """Context manager for YouTube API client.""" + client = build("youtube", "v3", developerKey=self.api_key) + try: + yield client + finally: + client.close() + def get_trending_videos(self) -> 
DataFrame: """Retrieves trending videos from YouTube and returns as DataFrame.""" try: @@ -152,113 +163,112 @@ def get_trending_videos(self) -> DataFrame: if not 1 <= self.max_results <= MAX_API_RESULTS: self.max_results = min(max(1, self.max_results), MAX_API_RESULTS) - # Build YouTube API client - youtube = build("youtube", "v3", developerKey=self.api_key) - - # Get country code - region_code = self.COUNTRY_CODES[self.region] - - # Prepare API request parts - parts = ["snippet"] - if self.include_statistics: - parts.append("statistics") - if self.include_content_details: - parts.append("contentDetails") - - # Prepare API request parameters - request_params = { - "part": ",".join(parts), - "chart": "mostPopular", - "regionCode": region_code, - "maxResults": self.max_results, - } - - # Add category filter if not "All" - if self.category != "All": - request_params["videoCategoryId"] = self.VIDEO_CATEGORIES[self.category] - - # Get trending videos - request = youtube.videos().list(**request_params) - response = request.execute() - - videos_data = [] - for item in response.get("items", []): - video_data = { - "video_id": item["id"], - "title": item["snippet"]["title"], - "description": item["snippet"]["description"], - "channel_id": item["snippet"]["channelId"], - "channel_title": item["snippet"]["channelTitle"], - "published_at": item["snippet"]["publishedAt"], - "url": f"https://www.youtube.com/watch?v={item['id']}", - "region": self.region, - "category": self.category, + # Use context manager for YouTube API client + with self.youtube_client() as youtube: + # Get country code + region_code = self.COUNTRY_CODES[self.region] + + # Prepare API request parts + parts = ["snippet"] + if self.include_statistics: + parts.append("statistics") + if self.include_content_details: + parts.append("contentDetails") + + # Prepare API request parameters + request_params = { + "part": ",".join(parts), + "chart": "mostPopular", + "regionCode": region_code, + "maxResults": 
self.max_results, } - # Add thumbnails if requested + # Add category filter if not "All" + if self.category != "All": + request_params["videoCategoryId"] = self.VIDEO_CATEGORIES[self.category] + + # Get trending videos + request = youtube.videos().list(**request_params) + response = request.execute() + + videos_data = [] + for item in response.get("items", []): + video_data = { + "video_id": item["id"], + "title": item["snippet"]["title"], + "description": item["snippet"]["description"], + "channel_id": item["snippet"]["channelId"], + "channel_title": item["snippet"]["channelTitle"], + "published_at": item["snippet"]["publishedAt"], + "url": f"https://www.youtube.com/watch?v={item['id']}", + "region": self.region, + "category": self.category, + } + + # Add thumbnails if requested + if self.include_thumbnails: + for size, thumb in item["snippet"]["thumbnails"].items(): + video_data[f"thumbnail_{size}_url"] = thumb["url"] + video_data[f"thumbnail_{size}_width"] = thumb.get("width", 0) + video_data[f"thumbnail_{size}_height"] = thumb.get("height", 0) + + # Add statistics if requested + if self.include_statistics and "statistics" in item: + video_data.update( + { + "view_count": int(item["statistics"].get("viewCount", 0)), + "like_count": int(item["statistics"].get("likeCount", 0)), + "comment_count": int(item["statistics"].get("commentCount", 0)), + } + ) + + # Add content details if requested + if self.include_content_details and "contentDetails" in item: + content_details = item["contentDetails"] + video_data.update( + { + "duration": self._format_duration(content_details["duration"]), + "definition": content_details.get("definition", "hd").upper(), + "has_captions": content_details.get("caption", "false") == "true", + "licensed_content": content_details.get("licensedContent", False), + "projection": content_details.get("projection", "rectangular"), + } + ) + + videos_data.append(video_data) + + # Convert to DataFrame + videos_df = pd.DataFrame(videos_data) + + # 
Organize columns + column_order = [ + "video_id", + "title", + "channel_id", + "channel_title", + "category", + "region", + "published_at", + "url", + "description", + ] + + if self.include_statistics: + column_order.extend(["view_count", "like_count", "comment_count"]) + + if self.include_content_details: + column_order.extend(["duration", "definition", "has_captions", "licensed_content", "projection"]) + + # Add thumbnail columns at the end if included if self.include_thumbnails: - for size, thumb in item["snippet"]["thumbnails"].items(): - video_data[f"thumbnail_{size}_url"] = thumb["url"] - video_data[f"thumbnail_{size}_width"] = thumb.get("width", 0) - video_data[f"thumbnail_{size}_height"] = thumb.get("height", 0) - - # Add statistics if requested - if self.include_statistics and "statistics" in item: - video_data.update( - { - "view_count": int(item["statistics"].get("viewCount", 0)), - "like_count": int(item["statistics"].get("likeCount", 0)), - "comment_count": int(item["statistics"].get("commentCount", 0)), - } - ) - - # Add content details if requested - if self.include_content_details and "contentDetails" in item: - content_details = item["contentDetails"] - video_data.update( - { - "duration": self._format_duration(content_details["duration"]), - "definition": content_details.get("definition", "hd").upper(), - "has_captions": content_details.get("caption", "false") == "true", - "licensed_content": content_details.get("licensedContent", False), - "projection": content_details.get("projection", "rectangular"), - } - ) - - videos_data.append(video_data) - - # Convert to DataFrame - videos_df = pd.DataFrame(videos_data) - - # Organize columns - column_order = [ - "video_id", - "title", - "channel_id", - "channel_title", - "category", - "region", - "published_at", - "url", - "description", - ] - - if self.include_statistics: - column_order.extend(["view_count", "like_count", "comment_count"]) - - if self.include_content_details: - 
column_order.extend(["duration", "definition", "has_captions", "licensed_content", "projection"]) - - # Add thumbnail columns at the end if included - if self.include_thumbnails: - thumbnail_cols = [col for col in videos_df.columns if col.startswith("thumbnail_")] - column_order.extend(sorted(thumbnail_cols)) - - # Reorder columns, including any that might not be in column_order - remaining_cols = [col for col in videos_df.columns if col not in column_order] - videos_df = videos_df[column_order + remaining_cols] - - return DataFrame(videos_df) + thumbnail_cols = [col for col in videos_df.columns if col.startswith("thumbnail_")] + column_order.extend(sorted(thumbnail_cols)) + + # Reorder columns, including any that might not be in column_order + remaining_cols = [col for col in videos_df.columns if col not in column_order] + videos_df = videos_df[column_order + remaining_cols] + + return DataFrame(videos_df) except HttpError as e: error_message = f"YouTube API error: {e}" diff --git a/src/backend/base/langflow/components/youtube/video_details.py b/src/backend/base/langflow/components/youtube/video_details.py index 85bde4239df4..fa23064af45c 100644 --- a/src/backend/base/langflow/components/youtube/video_details.py +++ b/src/backend/base/langflow/components/youtube/video_details.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager + import googleapiclient import pandas as pd from googleapiclient.discovery import build @@ -65,6 +67,15 @@ class YouTubeVideoDetailsComponent(Component): API_FORBIDDEN = 403 VIDEO_NOT_FOUND = 404 + @contextmanager + def youtube_client(self): + """Context manager for YouTube API client.""" + client = build("youtube", "v3", developerKey=self.api_key) + try: + yield client + finally: + client.close() + def _extract_video_id(self, video_url: str) -> str: """Extracts the video ID from a YouTube URL.""" import re @@ -107,136 +118,136 @@ def _format_duration(self, duration: str) -> str: def get_video_details(self) -> DataFrame: """Retrieves 
detailed information about a YouTube video and returns as DataFrame.""" try: - # Extract video ID and initialize API - video_id = self._extract_video_id(self.video_url) - youtube = build("youtube", "v3", developerKey=self.api_key) - - # Prepare parts for the API request - parts = ["snippet"] - if self.include_statistics: - parts.append("statistics") - if self.include_content_details: - parts.append("contentDetails") - - # Get video information - video_response = youtube.videos().list(part=",".join(parts), id=video_id).execute() - - if not video_response["items"]: - return DataFrame(pd.DataFrame({"error": ["Video not found"]})) - - video_info = video_response["items"][0] - snippet = video_info["snippet"] - - # Build video data dictionary - video_data = { - "video_id": [video_id], - "url": [f"https://www.youtube.com/watch?v={video_id}"], - "title": [snippet["title"]], - "description": [snippet["description"]], - "published_at": [snippet["publishedAt"]], - "channel_id": [snippet["channelId"]], - "channel_title": [snippet["channelTitle"]], - "category_id": [snippet.get("categoryId", "Unknown")], - "live_broadcast_content": [snippet.get("liveBroadcastContent", "none")], - } - - # Add thumbnails if requested - if self.include_thumbnails: - for size, thumb in snippet["thumbnails"].items(): - video_data[f"thumbnail_{size}_url"] = [thumb["url"]] - video_data[f"thumbnail_{size}_width"] = [thumb.get("width", 0)] - video_data[f"thumbnail_{size}_height"] = [thumb.get("height", 0)] - - # Add tags if requested - if self.include_tags and "tags" in snippet: - video_data["tags"] = [", ".join(snippet["tags"])] - video_data["tags_count"] = [len(snippet["tags"])] - - # Add statistics if requested - if self.include_statistics and "statistics" in video_info: - stats = video_info["statistics"] - video_data.update( - { - "view_count": [int(stats.get("viewCount", 0))], - "like_count": [int(stats.get("likeCount", 0))], - "favorite_count": [int(stats.get("favoriteCount", 0))], - 
"comment_count": [int(stats.get("commentCount", 0))], - } - ) - - # Add content details if requested - if self.include_content_details and "contentDetails" in video_info: - content_details = video_info["contentDetails"] - video_data.update( - { - "duration": [self._format_duration(content_details["duration"])], - "dimension": [content_details.get("dimension", "2d")], - "definition": [content_details.get("definition", "hd").upper()], - "has_captions": [content_details.get("caption", "false") == "true"], - "licensed_content": [content_details.get("licensedContent", False)], - "projection": [content_details.get("projection", "rectangular")], - "has_custom_thumbnails": [content_details.get("hasCustomThumbnail", False)], - } - ) - - # Add content rating if available - if "contentRating" in content_details: - rating_info = content_details["contentRating"] - video_data["content_rating"] = [str(rating_info)] - - # Create DataFrame with organized columns - video_df = pd.DataFrame(video_data) - - # Organize columns in logical groups - basic_cols = [ - "video_id", - "title", - "url", - "channel_id", - "channel_title", - "published_at", - "category_id", - "live_broadcast_content", - "description", - ] - - stat_cols = ["view_count", "like_count", "favorite_count", "comment_count"] - - content_cols = [ - "duration", - "dimension", - "definition", - "has_captions", - "licensed_content", - "projection", - "has_custom_thumbnails", - "content_rating", - ] - - tag_cols = ["tags", "tags_count"] - - thumb_cols = [col for col in video_df.columns if col.startswith("thumbnail_")] - - # Reorder columns based on what's included - ordered_cols = basic_cols[:] - - if self.include_statistics: - ordered_cols.extend([col for col in stat_cols if col in video_df.columns]) - - if self.include_content_details: - ordered_cols.extend([col for col in content_cols if col in video_df.columns]) - - if self.include_tags: - ordered_cols.extend([col for col in tag_cols if col in video_df.columns]) - - if 
self.include_thumbnails: - ordered_cols.extend(sorted(thumb_cols)) - - # Add any remaining columns - remaining_cols = [col for col in video_df.columns if col not in ordered_cols] - ordered_cols.extend(remaining_cols) - - return DataFrame(video_df[ordered_cols]) + with self.youtube_client() as youtube: + # Extract video ID + video_id = self._extract_video_id(self.video_url) + + # Prepare parts for the API request + parts = ["snippet"] + if self.include_statistics: + parts.append("statistics") + if self.include_content_details: + parts.append("contentDetails") + + # Get video information + video_response = youtube.videos().list(part=",".join(parts), id=video_id).execute() + + if not video_response["items"]: + return DataFrame(pd.DataFrame({"error": ["Video not found"]})) + + video_info = video_response["items"][0] + snippet = video_info["snippet"] + + # Build video data dictionary + video_data = { + "video_id": [video_id], + "url": [f"https://www.youtube.com/watch?v={video_id}"], + "title": [snippet["title"]], + "description": [snippet["description"]], + "published_at": [snippet["publishedAt"]], + "channel_id": [snippet["channelId"]], + "channel_title": [snippet["channelTitle"]], + "category_id": [snippet.get("categoryId", "Unknown")], + "live_broadcast_content": [snippet.get("liveBroadcastContent", "none")], + } + + # Add thumbnails if requested + if self.include_thumbnails: + for size, thumb in snippet["thumbnails"].items(): + video_data[f"thumbnail_{size}_url"] = [thumb["url"]] + video_data[f"thumbnail_{size}_width"] = [thumb.get("width", 0)] + video_data[f"thumbnail_{size}_height"] = [thumb.get("height", 0)] + + # Add tags if requested + if self.include_tags and "tags" in snippet: + video_data["tags"] = [", ".join(snippet["tags"])] + video_data["tags_count"] = [len(snippet["tags"])] + + # Add statistics if requested + if self.include_statistics and "statistics" in video_info: + stats = video_info["statistics"] + video_data.update( + { + "view_count": 
[int(stats.get("viewCount", 0))], + "like_count": [int(stats.get("likeCount", 0))], + "favorite_count": [int(stats.get("favoriteCount", 0))], + "comment_count": [int(stats.get("commentCount", 0))], + } + ) + + # Add content details if requested + if self.include_content_details and "contentDetails" in video_info: + content_details = video_info["contentDetails"] + video_data.update( + { + "duration": [self._format_duration(content_details["duration"])], + "dimension": [content_details.get("dimension", "2d")], + "definition": [content_details.get("definition", "hd").upper()], + "has_captions": [content_details.get("caption", "false") == "true"], + "licensed_content": [content_details.get("licensedContent", False)], + "projection": [content_details.get("projection", "rectangular")], + "has_custom_thumbnails": [content_details.get("hasCustomThumbnail", False)], + } + ) + + # Add content rating if available + if "contentRating" in content_details: + rating_info = content_details["contentRating"] + video_data["content_rating"] = [str(rating_info)] + + # Create DataFrame with organized columns + video_df = pd.DataFrame(video_data) + + # Organize columns in logical groups + basic_cols = [ + "video_id", + "title", + "url", + "channel_id", + "channel_title", + "published_at", + "category_id", + "live_broadcast_content", + "description", + ] + + stat_cols = ["view_count", "like_count", "favorite_count", "comment_count"] + + content_cols = [ + "duration", + "dimension", + "definition", + "has_captions", + "licensed_content", + "projection", + "has_custom_thumbnails", + "content_rating", + ] + + tag_cols = ["tags", "tags_count"] + + thumb_cols = [col for col in video_df.columns if col.startswith("thumbnail_")] + + # Reorder columns based on what's included + ordered_cols = basic_cols[:] + + if self.include_statistics: + ordered_cols.extend([col for col in stat_cols if col in video_df.columns]) + + if self.include_content_details: + ordered_cols.extend([col for col in 
content_cols if col in video_df.columns]) + + if self.include_tags: + ordered_cols.extend([col for col in tag_cols if col in video_df.columns]) + + if self.include_thumbnails: + ordered_cols.extend(sorted(thumb_cols)) + + # Add any remaining columns + remaining_cols = [col for col in video_df.columns if col not in ordered_cols] + ordered_cols.extend(remaining_cols) + + return DataFrame(video_df[ordered_cols]) except (HttpError, googleapiclient.errors.HttpError) as e: error_message = f"YouTube API error: {e!s}" From a95d24820cece38d1b95bc62443e222a21163217 Mon Sep 17 00:00:00 2001 From: Raphael Christi Date: Thu, 16 Jan 2025 14:30:56 -0300 Subject: [PATCH 06/16] feat(components): refactor YouTube Transcripts component - Split output into dedicated Chunks (DataFrame) and Transcript (Message) formats - Remove redundant language parameter and transcript format selection - Improve chunk timestamp handling and data structure - Simplify interface by making each output specialized --- .../components/youtube/youtube_transcripts.py | 226 +++--------------- 1 file changed, 34 insertions(+), 192 deletions(-) diff --git a/src/backend/base/langflow/components/youtube/youtube_transcripts.py b/src/backend/base/langflow/components/youtube/youtube_transcripts.py index 57fb44b80b97..2979ed88869d 100644 --- a/src/backend/base/langflow/components/youtube/youtube_transcripts.py +++ b/src/backend/base/langflow/components/youtube/youtube_transcripts.py @@ -1,9 +1,11 @@ +import pandas as pd +import youtube_transcript_api from langchain_community.document_loaders import YoutubeLoader from langchain_community.document_loaders.youtube import TranscriptFormat from langflow.custom import Component from langflow.inputs import DropdownInput, IntInput, MultilineInput -from langflow.schema import Message +from langflow.schema import DataFrame, Message from langflow.template import Output @@ -11,8 +13,9 @@ class YouTubeTranscriptsComponent(Component): """A component that extracts spoken content from 
YouTube videos as transcripts.""" display_name: str = "YouTube Transcripts" - description: str = "Extracts spoken content from YouTube videos as transcripts." + description: str = "Extracts spoken content from YouTube videos with both DataFrame and text output options." icon: str = "YouTube" + name = "YouTubeTranscripts" inputs = [ MultilineInput( @@ -21,164 +24,11 @@ class YouTubeTranscriptsComponent(Component): info="Enter the YouTube video URL to get transcripts from.", tool_mode=True, ), - DropdownInput( - name="transcript_format", - display_name="Transcript Format", - options=["text", "chunks"], - value="text", - info="The format of the transcripts. Either 'text' for a single output or 'chunks' for timestamped chunks.", - advanced=True, - ), IntInput( name="chunk_size_seconds", display_name="Chunk Size (seconds)", value=60, - advanced=True, - info="The size of each transcript chunk in seconds. Only applicable when " - "'Transcript Format' is set to 'chunks'.", - ), - DropdownInput( - name="language", - display_name="Language", - options=[ - "af", - "ak", - "sq", - "am", - "ar", - "hy", - "as", - "ay", - "az", - "bn", - "eu", - "be", - "bho", - "bs", - "bg", - "my", - "ca", - "ceb", - "zh", - "zh-HK", - "zh-CN", - "zh-SG", - "zh-TW", - "zh-Hans", - "zh-Hant", - "hak-TW", - "nan-TW", - "co", - "hr", - "cs", - "da", - "dv", - "nl", - "en", - "en-US", - "eo", - "et", - "ee", - "fil", - "fi", - "fr", - "gl", - "lg", - "ka", - "de", - "el", - "gn", - "gu", - "ht", - "ha", - "haw", - "iw", - "hi", - "hmn", - "hu", - "is", - "ig", - "id", - "ga", - "it", - "ja", - "jv", - "kn", - "kk", - "km", - "rw", - "ko", - "kri", - "ku", - "ky", - "lo", - "la", - "lv", - "ln", - "lt", - "lb", - "mk", - "mg", - "ms", - "ml", - "mt", - "mi", - "mr", - "mn", - "ne", - "nso", - "no", - "ny", - "or", - "om", - "ps", - "fa", - "pl", - "pt", - "pa", - "qu", - "ro", - "ru", - "sm", - "sa", - "gd", - "sr", - "sn", - "sd", - "si", - "sk", - "sl", - "so", - "st", - "es", - "su", - "sw", - 
"sv", - "tg", - "ta", - "tt", - "te", - "th", - "ti", - "ts", - "tr", - "tk", - "uk", - "ur", - "ug", - "uz", - "vi", - "cy", - "fy", - "xh", - "yi", - "yo", - "zu", - ], - value="en", - info=( - "Specify to make sure the transcripts are retrieved in your desired language. Defaults to English: 'en'" - ), + info="The size of each transcript chunk in seconds.", ), DropdownInput( name="translation", @@ -190,53 +40,45 @@ class YouTubeTranscriptsComponent(Component): ] outputs = [ - Output(name="transcripts", display_name="Transcription", method="build_youtube_transcripts"), + Output(name="dataframe", display_name="Chunks", method="get_dataframe_output"), + Output(name="message", display_name="Transcript", method="get_message_output"), ] - def build_youtube_transcripts(self) -> Message: - """Method to extracts transcripts from a YouTube video URL. + def _load_transcripts(self, *, as_chunks: bool = True): + """Internal method to load transcripts from YouTube.""" + loader = YoutubeLoader.from_youtube_url( + self.url, + transcript_format=TranscriptFormat.CHUNKS if as_chunks else TranscriptFormat.TEXT, + chunk_size_seconds=self.chunk_size_seconds, + translation=self.translation or None, + ) + return loader.load() - Returns: - Message: The transcripts of the video as a text string. If 'transcript_format' - is 'text', the transcripts are returned as a single continuous string. If - 'transcript_format' is 'chunks', the transcripts are returned as a string - with timestamped segments. - - Raises: - Exception: Returns an error message if transcript retrieval fails. 
- """ + def get_dataframe_output(self) -> DataFrame: + """Provides transcript output as a DataFrame with timestamp and text columns.""" try: - # Attempt to load transcripts in the specified language, fallback to any available language - languages = [self.language] if self.language else None - loader = YoutubeLoader.from_youtube_url( - self.url, - transcript_format=TranscriptFormat.TEXT - if self.transcript_format == "text" - else TranscriptFormat.CHUNKS, - chunk_size_seconds=self.chunk_size_seconds, - language=languages, - translation=self.translation or None, - ) - - transcripts = loader.load() - - if self.transcript_format == "text": - # Extract only the page_content from the Document - result = transcripts[0].page_content - return Message(text=result) + transcripts = self._load_transcripts(as_chunks=True) - # For chunks, format the output with timestamps - formatted_chunks = [] + # Create DataFrame with timestamp and text columns + data = [] for doc in transcripts: start_seconds = int(doc.metadata["start_seconds"]) start_minutes = start_seconds // 60 start_seconds %= 60 timestamp = f"{start_minutes:02d}:{start_seconds:02d}" - formatted_chunks.append(f"{timestamp} {doc.page_content}") - result = "\n".join(formatted_chunks) + data.append({"timestamp": timestamp, "text": doc.page_content}) + return DataFrame(pd.DataFrame(data)) + + except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc: + return DataFrame(pd.DataFrame({"error": [f"Failed to get YouTube transcripts: {exc!s}"]})) + + def get_message_output(self) -> Message: + """Provides transcript output as continuous text.""" + try: + transcripts = self._load_transcripts(as_chunks=False) + result = transcripts[0].page_content return Message(text=result) - except Exception as exc: # noqa: BLE001 - # Using a specific error type for the return value + except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc: error_msg = f"Failed 
to get YouTube transcripts: {exc!s}" return Message(text=error_msg) From d629f696462252a3b26b123e290555a864303e6c Mon Sep 17 00:00:00 2001 From: Raphael Christi Date: Thu, 16 Jan 2025 14:47:43 -0300 Subject: [PATCH 07/16] fix(youtube): ensure all URL inputs are marked as required --- src/backend/base/langflow/components/youtube/channel.py | 1 + src/backend/base/langflow/components/youtube/comments.py | 1 + src/backend/base/langflow/components/youtube/playlist.py | 1 + src/backend/base/langflow/components/youtube/search.py | 1 + src/backend/base/langflow/components/youtube/video_details.py | 1 + .../base/langflow/components/youtube/youtube_transcripts.py | 1 + 6 files changed, 6 insertions(+) diff --git a/src/backend/base/langflow/components/youtube/channel.py b/src/backend/base/langflow/components/youtube/channel.py index 41ae07e04e9f..62a6e5bcd93a 100644 --- a/src/backend/base/langflow/components/youtube/channel.py +++ b/src/backend/base/langflow/components/youtube/channel.py @@ -30,6 +30,7 @@ class YouTubeChannelComponent(Component): display_name="Channel URL or ID", info="The URL or ID of the YouTube channel.", tool_mode=True, + required=True, ), SecretStrInput( name="api_key", diff --git a/src/backend/base/langflow/components/youtube/comments.py b/src/backend/base/langflow/components/youtube/comments.py index 0976326ad119..05fccce56b2f 100644 --- a/src/backend/base/langflow/components/youtube/comments.py +++ b/src/backend/base/langflow/components/youtube/comments.py @@ -28,6 +28,7 @@ class YouTubeCommentsComponent(Component): display_name="Video URL", info="The URL of the YouTube video to get comments from.", tool_mode=True, + required=True, ), SecretStrInput( name="api_key", diff --git a/src/backend/base/langflow/components/youtube/playlist.py b/src/backend/base/langflow/components/youtube/playlist.py index 604128cd16d6..d81d657eadc2 100644 --- a/src/backend/base/langflow/components/youtube/playlist.py +++ 
b/src/backend/base/langflow/components/youtube/playlist.py @@ -16,6 +16,7 @@ class YouTubePlaylistComponent(Component): name="playlist_url", display_name="Playlist URL", info="URL of the YouTube playlist.", + required=True, ), ] diff --git a/src/backend/base/langflow/components/youtube/search.py b/src/backend/base/langflow/components/youtube/search.py index 7d793ed2d152..1efdee7f0f0d 100644 --- a/src/backend/base/langflow/components/youtube/search.py +++ b/src/backend/base/langflow/components/youtube/search.py @@ -23,6 +23,7 @@ class YouTubeSearchComponent(Component): display_name="Search Query", info="The search query to look for on YouTube.", tool_mode=True, + required=True, ), SecretStrInput( name="api_key", diff --git a/src/backend/base/langflow/components/youtube/video_details.py b/src/backend/base/langflow/components/youtube/video_details.py index fa23064af45c..013d1d46514c 100644 --- a/src/backend/base/langflow/components/youtube/video_details.py +++ b/src/backend/base/langflow/components/youtube/video_details.py @@ -24,6 +24,7 @@ class YouTubeVideoDetailsComponent(Component): display_name="Video URL", info="The URL of the YouTube video.", tool_mode=True, + required=True, ), SecretStrInput( name="api_key", diff --git a/src/backend/base/langflow/components/youtube/youtube_transcripts.py b/src/backend/base/langflow/components/youtube/youtube_transcripts.py index 2979ed88869d..73eeb012a15b 100644 --- a/src/backend/base/langflow/components/youtube/youtube_transcripts.py +++ b/src/backend/base/langflow/components/youtube/youtube_transcripts.py @@ -23,6 +23,7 @@ class YouTubeTranscriptsComponent(Component): display_name="Video URL", info="Enter the YouTube video URL to get transcripts from.", tool_mode=True, + required=True, ), IntInput( name="chunk_size_seconds", From 21e71624dc4a444f146b6d6ccf0f6bc2402d3063 Mon Sep 17 00:00:00 2001 From: cristhianzl Date: Mon, 20 Jan 2025 11:21:31 -0300 Subject: [PATCH 08/16] fix youtube tests --- 
.../components/tools/youtube_transcripts.py | 244 ------------------ .../integrations/youtube-transcripts.spec.ts | 18 +- 2 files changed, 12 insertions(+), 250 deletions(-) delete mode 100644 src/backend/base/langflow/components/tools/youtube_transcripts.py diff --git a/src/backend/base/langflow/components/tools/youtube_transcripts.py b/src/backend/base/langflow/components/tools/youtube_transcripts.py deleted file mode 100644 index 19250194bd16..000000000000 --- a/src/backend/base/langflow/components/tools/youtube_transcripts.py +++ /dev/null @@ -1,244 +0,0 @@ -from langchain_community.document_loaders import YoutubeLoader -from langchain_community.document_loaders.youtube import TranscriptFormat - -from langflow.custom import Component -from langflow.inputs import DropdownInput, IntInput, MultilineInput -from langflow.schema import Message -from langflow.template import Output - - -class YouTubeTranscriptsComponent(Component): - """A component that extracts spoken content from YouTube videos as transcripts.""" - - display_name: str = "YouTube Transcripts" - description: str = "Extracts spoken content from YouTube videos as transcripts." - icon: str = "YouTube" - name = "YouTubeTranscripts" - - inputs = [ - MultilineInput( - name="url", - display_name="Video URL", - info="Enter the YouTube video URL to get transcripts from.", - tool_mode=True, - required=True, - ), - DropdownInput( - name="transcript_format", - display_name="Transcript Format", - options=["text", "chunks"], - value="text", - info="The format of the transcripts. Either 'text' for a single output or 'chunks' for timestamped chunks.", - advanced=True, - ), - IntInput( - name="chunk_size_seconds", - display_name="Chunk Size (seconds)", - value=60, - advanced=True, - info="The size of each transcript chunk in seconds. 
Only applicable when " - "'Transcript Format' is set to 'chunks'.", - ), - DropdownInput( - name="language", - display_name="Language", - options=[ - "af", - "ak", - "sq", - "am", - "ar", - "hy", - "as", - "ay", - "az", - "bn", - "eu", - "be", - "bho", - "bs", - "bg", - "my", - "ca", - "ceb", - "zh", - "zh-HK", - "zh-CN", - "zh-SG", - "zh-TW", - "zh-Hans", - "zh-Hant", - "hak-TW", - "nan-TW", - "co", - "hr", - "cs", - "da", - "dv", - "nl", - "en", - "en-US", - "eo", - "et", - "ee", - "fil", - "fi", - "fr", - "gl", - "lg", - "ka", - "de", - "el", - "gn", - "gu", - "ht", - "ha", - "haw", - "iw", - "hi", - "hmn", - "hu", - "is", - "ig", - "id", - "ga", - "it", - "ja", - "jv", - "kn", - "kk", - "km", - "rw", - "ko", - "kri", - "ku", - "ky", - "lo", - "la", - "lv", - "ln", - "lt", - "lb", - "mk", - "mg", - "ms", - "ml", - "mt", - "mi", - "mr", - "mn", - "ne", - "nso", - "no", - "ny", - "or", - "om", - "ps", - "fa", - "pl", - "pt", - "pa", - "qu", - "ro", - "ru", - "sm", - "sa", - "gd", - "sr", - "sn", - "sd", - "si", - "sk", - "sl", - "so", - "st", - "es", - "su", - "sw", - "sv", - "tg", - "ta", - "tt", - "te", - "th", - "ti", - "ts", - "tr", - "tk", - "uk", - "ur", - "ug", - "uz", - "vi", - "cy", - "fy", - "xh", - "yi", - "yo", - "zu", - ], - value="en", - info=( - "Specify to make sure the transcripts are retrieved in your desired language. Defaults to English: 'en'" - ), - ), - DropdownInput( - name="translation", - display_name="Translation Language", - advanced=True, - options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"], - info="Translate the transcripts to the specified language. Leave empty for no translation.", - ), - ] - - outputs = [ - Output(name="transcripts", display_name="Transcription", method="build_youtube_transcripts"), - ] - - def build_youtube_transcripts(self) -> Message: - """Method to extracts transcripts from a YouTube video URL. - - Returns: - Message: The transcripts of the video as a text string. 
If 'transcript_format' - is 'text', the transcripts are returned as a single continuous string. If - 'transcript_format' is 'chunks', the transcripts are returned as a string - with timestamped segments. - - Raises: - Exception: Returns an error message if transcript retrieval fails. - """ - try: - # Attempt to load transcripts in the specified language, fallback to any available language - languages = [self.language] if self.language else None - loader = YoutubeLoader.from_youtube_url( - self.url, - transcript_format=TranscriptFormat.TEXT - if self.transcript_format == "text" - else TranscriptFormat.CHUNKS, - chunk_size_seconds=self.chunk_size_seconds, - language=languages, - translation=self.translation or None, - ) - - transcripts = loader.load() - - if self.transcript_format == "text": - # Extract only the page_content from the Document - result = transcripts[0].page_content - return Message(text=result) - - # For chunks, format the output with timestamps - formatted_chunks = [] - for doc in transcripts: - start_seconds = int(doc.metadata["start_seconds"]) - start_minutes = start_seconds // 60 - start_seconds %= 60 - timestamp = f"{start_minutes:02d}:{start_seconds:02d}" - formatted_chunks.append(f"{timestamp} {doc.page_content}") - result = "\n".join(formatted_chunks) - return Message(text=result) - - except Exception as exc: # noqa: BLE001 - # Using a specific error type for the return value - error_msg = f"Failed to get YouTube transcripts: {exc!s}" - return Message(text=error_msg) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index 9fed71a6f79e..cc5480bc9bb7 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -11,15 +11,19 @@ test( await page.getByTestId("sidebar-search-input").click(); await 
page.getByTestId("sidebar-search-input").fill("youtube"); - await page.waitForSelector('[id="toolsYouTube Transcripts"]', { + await page.waitForSelector('[id="youtubeYouTube Transcripts"]', { timeout: 3000, }); await page - .locator('//*[@id="toolsYouTube Transcripts"]') - .dragTo(page.locator('//*[@id="react-flow-id"]')); - await page.mouse.up(); - await page.mouse.down(); + .getByTestId("youtubeYouTube Transcripts") + .hover() + .then(async () => { + await page + .getByTestId("add-component-button-youtube-transcripts") + .click(); + }); + await page.getByTestId("fit_view").click(); let outdatedComponents = await page @@ -35,11 +39,13 @@ test( .getByTestId("textarea_str_url") .fill("https://www.youtube.com/watch?v=VqhCQZaH4Vs"); + await page.getByTestId("fit_view").click(); + await page.getByTestId("button_run_youtube transcripts").click(); await page.waitForSelector("text=built successfully", { timeout: 30000 }); - await page.getByTestId("output-inspection-transcription").first().click(); + await page.getByTestId("output-inspection-transcript").first().click(); await page.waitForSelector("text=Component Output", { timeout: 30000 }); From 25b16305506c3e45e22dc67c2b4325716bc496cd Mon Sep 17 00:00:00 2001 From: cristhianzl Date: Mon, 20 Jan 2025 12:13:16 -0300 Subject: [PATCH 09/16] fix youtube tests --- src/backend/base/langflow/components/tools/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/base/langflow/components/tools/__init__.py b/src/backend/base/langflow/components/tools/__init__.py index 8be4d88331f6..3255eaf333fe 100644 --- a/src/backend/base/langflow/components/tools/__init__.py +++ b/src/backend/base/langflow/components/tools/__init__.py @@ -28,7 +28,6 @@ from .wolfram_alpha_api import WolframAlphaAPIComponent from .yahoo import YfinanceComponent from .yahoo_finance import YfinanceToolComponent -from .youtube_transcripts import YouTubeTranscriptsComponent with warnings.catch_warnings(): warnings.simplefilter("ignore", 
LangChainDeprecationWarning) @@ -64,5 +63,4 @@ "WolframAlphaAPIComponent", "YfinanceComponent", "YfinanceToolComponent", - "YouTubeTranscriptsComponent", ] From 723774ae93c68f0aa0eb1609c36213c86055f447 Mon Sep 17 00:00:00 2001 From: cristhianzl Date: Mon, 20 Jan 2025 13:08:51 -0300 Subject: [PATCH 10/16] Refactor YouTube transcript integration test --- .../tests/extended/integrations/youtube-transcripts.spec.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index cc5480bc9bb7..8d3101c5c654 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -48,9 +48,7 @@ test( await page.getByTestId("output-inspection-transcript").first().click(); await page.waitForSelector("text=Component Output", { timeout: 30000 }); - await page.getByRole("gridcell").first().click(); - const value = await page.getByPlaceholder("Empty").inputValue(); expect(value.length).toBeGreaterThan(10); }, From b593f635d4c36231645c34f308542da42c142b45 Mon Sep 17 00:00:00 2001 From: cristhianzl Date: Mon, 20 Jan 2025 14:11:34 -0300 Subject: [PATCH 11/16] =?UTF-8?q?=E2=9C=85=20(youtube-transcripts.spec.ts)?= =?UTF-8?q?:=20remove=20unnecessary=20line=20of=20code=20to=20improve=20co?= =?UTF-8?q?de=20readability=20and=20maintainability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/extended/integrations/youtube-transcripts.spec.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index 8d3101c5c654..695e5563b322 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ 
-46,7 +46,6 @@ test( await page.waitForSelector("text=built successfully", { timeout: 30000 }); await page.getByTestId("output-inspection-transcript").first().click(); - await page.waitForSelector("text=Component Output", { timeout: 30000 }); await page.getByRole("gridcell").first().click(); const value = await page.getByPlaceholder("Empty").inputValue(); From c0dd889404c1bea117722f0e36534d0b1f778c5d Mon Sep 17 00:00:00 2001 From: cristhianzl Date: Mon, 20 Jan 2025 14:39:22 -0300 Subject: [PATCH 12/16] =?UTF-8?q?=E2=9C=85=20(youtube-transcripts.spec.ts)?= =?UTF-8?q?:=20add=20delays=20before=20certain=20actions=20to=20ensure=20p?= =?UTF-8?q?roper=20execution=20and=20avoid=20race=20conditions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/extended/integrations/youtube-transcripts.spec.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index 695e5563b322..4df786c52168 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -41,10 +41,16 @@ test( await page.getByTestId("fit_view").click(); + await page.waitForTimeout(500); + await page.getByTestId("button_run_youtube transcripts").click(); + await page.waitForTimeout(500); + await page.waitForSelector("text=built successfully", { timeout: 30000 }); + await page.waitForTimeout(500); + await page.getByTestId("output-inspection-transcript").first().click(); await page.waitForSelector("text=Component Output", { timeout: 30000 }); await page.getByRole("gridcell").first().click(); From 34b797094e0ef06321755b0005faba69897a2c10 Mon Sep 17 00:00:00 2001 From: anovazzi1 Date: Mon, 20 Jan 2025 15:01:38 -0300 Subject: [PATCH 13/16] Refactor YouTube transcript integration test --- .../integrations/youtube-transcripts.spec.ts | 
20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index 4df786c52168..38cb62557126 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -11,18 +11,12 @@ test( await page.getByTestId("sidebar-search-input").click(); await page.getByTestId("sidebar-search-input").fill("youtube"); - await page.waitForSelector('[id="youtubeYouTube Transcripts"]', { - timeout: 3000, - }); - await page .getByTestId("youtubeYouTube Transcripts") - .hover() - .then(async () => { - await page - .getByTestId("add-component-button-youtube-transcripts") - .click(); - }); + .hover(); + await page + .getByTestId("add-component-button-youtube-transcripts") + .click(); await page.getByTestId("fit_view").click(); @@ -41,16 +35,10 @@ test( await page.getByTestId("fit_view").click(); - await page.waitForTimeout(500); - await page.getByTestId("button_run_youtube transcripts").click(); - await page.waitForTimeout(500); - await page.waitForSelector("text=built successfully", { timeout: 30000 }); - await page.waitForTimeout(500); - await page.getByTestId("output-inspection-transcript").first().click(); await page.waitForSelector("text=Component Output", { timeout: 30000 }); await page.getByRole("gridcell").first().click(); From 2783c1d72e22f395c9985c3e71ad43eb4cd25260 Mon Sep 17 00:00:00 2001 From: anovazzi1 Date: Mon, 20 Jan 2025 15:12:30 -0300 Subject: [PATCH 14/16] Refactor YouTube transcript integration test: Increase timeout for successful build check --- .../tests/extended/integrations/youtube-transcripts.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index 
38cb62557126..06877d6a379f 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -37,7 +37,7 @@ test( await page.getByTestId("button_run_youtube transcripts").click(); - await page.waitForSelector("text=built successfully", { timeout: 30000 }); + await page.waitForSelector("text=built successfully", { timeout: 300000 }); await page.getByTestId("output-inspection-transcript").first().click(); await page.waitForSelector("text=Component Output", { timeout: 30000 }); From 45eb389033d9941442b42a4b1bb020beed6c5c64 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 18:13:51 +0000 Subject: [PATCH 15/16] [autofix.ci] apply automated fixes --- .../extended/integrations/youtube-transcripts.spec.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index 06877d6a379f..cb07a1e3928b 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -11,12 +11,8 @@ test( await page.getByTestId("sidebar-search-input").click(); await page.getByTestId("sidebar-search-input").fill("youtube"); - await page - .getByTestId("youtubeYouTube Transcripts") - .hover(); - await page - .getByTestId("add-component-button-youtube-transcripts") - .click(); + await page.getByTestId("youtubeYouTube Transcripts").hover(); + await page.getByTestId("add-component-button-youtube-transcripts").click(); await page.getByTestId("fit_view").click(); From 79e49715e331cb485000f211057248c4850c9efc Mon Sep 17 00:00:00 2001 From: anovazzi1 Date: Mon, 20 Jan 2025 15:24:40 -0300 Subject: [PATCH 16/16] Refactor YouTube transcript integration test: Add delay before adding component --- 
.../tests/extended/integrations/youtube-transcripts.spec.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts index cb07a1e3928b..1ef0b5dd36fc 100644 --- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts +++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts @@ -11,6 +11,8 @@ test( await page.getByTestId("sidebar-search-input").click(); await page.getByTestId("sidebar-search-input").fill("youtube"); + await page.waitForTimeout(2000); + await page.getByTestId("youtubeYouTube Transcripts").hover(); await page.getByTestId("add-component-button-youtube-transcripts").click();