-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathp3.py
83 lines (67 loc) · 3.14 KB
/
p3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import youtube_dl
import moviepy.editor as mp
import speech_recognition as sr
import json
def extract_video_data(video_url):
"""
Extracts metadata, transcript (optional), and timestamps (optional) from a YouTube video.
Args:
video_url: The URL of the YouTube video.
Returns:
A dictionary containing the extracted video data.
"""
ydl_opts = {
'format': 'best', # Download the best available quality
'quiet': True, # Suppress output
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
info_dict = ydl.extract_info(video_url, download=False) # Get metadata without downloading
video_data = {
"duration": info_dict.get('duration', 0), # Video duration in seconds
"image_id": None, # You'll need to handle image extraction separately if needed
"plays": info_dict.get('view_count', 0),
"created_at": info_dict.get('upload_date', ''),
"size": None, # You'll likely need to download the video to get its size
"transcodings": [], # We'll populate this later if transcribing
"title": info_dict.get('title', ''),
"updated_at": None, # YouTube doesn't provide this directly
"account_id": info_dict.get('channel_id', ''),
"progress": 1.0, # Assuming video is fully processed
"original_movie": {
"http_uri": None, # You'll need to download to get this
"size": None
},
"tags": info_dict.get('tags', []),
"id": info_dict.get('id', ''),
"aspect_ratio_multiplier": None, # You'll need to calculate this from video dimensions
"screenshot": {
"normal": info_dict.get('thumbnail', ''),
"thumb": None, # You might need to generate thumbnails yourself
"original": info_dict.get('thumbnail', '')
},
"state": "ready" # Assuming video is ready
}
# Optionally, extract transcript and timestamps
if EXTRACT_TRANSCRIPT:
video_data['transcodings'], video_data['size'] = extract_transcript_and_timestamps(video_url)
return video_data
def extract_transcript_and_timestamps(video_url):
"""
Extracts transcript and timestamps from a YouTube video using SpeechRecognition.
Args:
video_url: The URL of the YouTube video.
Returns:
A tuple containing:
- A list of transcoding dictionaries (potentially empty if transcription fails).
- The size of the downloaded video file in bytes (or None if download fails).
"""
# ... (Implementation for downloading video, transcribing audio, and generating timestamps)
return transcodings, video_size
# Main execution
if __name__ == "__main__":
video_url = "https://www.youtube.com/watch?v=hf--vE6dVOQ" # Replace with your video URL
EXTRACT_TRANSCRIPT = False # Set to True if you want to extract transcript and timestamps
video_data = extract_video_data(video_url)
# Save to JSON file
with open("video_data.json", "w") as f:
json.dump(video_data, f, indent=4)