yt-history-to-csv.py

import sys
import csv
import os
import re
from bs4 import BeautifulSoup
from yt_dlp import YoutubeDL
from datetime import datetime
from colorama import Fore, Style
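
# Converts a Google Takeout-style YouTube watch-history HTML export into CSV
# rows of (artist, track, album, timestamp, album artist, duration), resolving
# each video's metadata with yt-dlp. Invocation (file names are illustrative):
#
#   python yt-history-to-csv.py watch-history.html scrobbles.csv
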
def get_metadata(url):
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
        'forceurl': True,
        'forcetitle': True,
        'forcedescription': True,
        'writeinfojson': True,
        'simulate': True,
        'youtube_include_dash_manifest': False
    }
    with YoutubeDL(ydl_opts) as ydl:
        try:
            meta = ydl.extract_info(url, download=False)
            return meta
        except Exception:
            print(f"{Fore.RED}Failed to get metadata for {url}{Style.RESET_ALL}")
            return None
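
# A minimal sketch of how the info dict returned above is consumed below; the
# URL is a placeholder, and the keys are present only when yt-dlp can extract
# them (typically for music uploads):
#
#   meta = get_metadata('https://www.youtube.com/watch?v=<video-id>')
#   if meta:
#       artist = meta.get('artist', '')
#       duration = str(meta.get('duration', ''))
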
def parse_html(input_file, output_file, resume=False):
    with open(input_file, "r", encoding="utf-8") as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'lxml')
    # Each watch-history entry in the export sits in one of these content cells
    divs = soup.find_all('div', {'class': 'content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1'})
    last_processed_index = 0
    if resume:
        last_processed_index = get_last_processed_index(output_file)
        if last_processed_index is None:
            print(f"{Fore.YELLOW}Unable to determine the last processed index. Resuming from the beginning.{Style.RESET_ALL}")
            last_processed_index = 0
        else:
            last_processed_index += 1
        divs = divs[last_processed_index:]
    # Open the CSV in append mode so resumed runs extend the existing file
    with open(output_file, 'a', newline='') as file:
        writer = csv.writer(file)
        try:
            for index, div in enumerate(divs, last_processed_index):
                link = div.find('a', href=re.compile(r'https://www\.youtube\.com/watch\?v=.+'))
                if link:
                    print(f"{Fore.GREEN}Parsing: {link['href']}{Style.RESET_ALL}")
                    meta = get_metadata(link['href'])
                    text_list = div.text.split('\n')
                    # Timestamps in this export end with the timezone abbreviation
                    # WIB (Western Indonesia Time); strip it, then drop commas and
                    # non-breaking spaces so strptime can parse the remainder
                    timestamp_text = None
                    for text in text_list:
                        if "WIB" in text:
                            timestamp_text = text.split("WIB")[0].strip().replace(',', '').replace('\xa0', ' ')
                            break
                    timestamp = ""
                    if timestamp_text:
                        dt_obj = datetime.strptime(timestamp_text, '%b %d %Y %I:%M:%S %p')
                        timestamp = dt_obj.strftime('%Y-%m-%d %H:%M:%S')
                    # Get the artist, track, album, and duration details
                    artist = ""
                    track = ""
                    album = ""
                    duration = ""
                    if meta:
                        if 'artist' in meta:
                            artist = meta['artist']
                        if 'track' in meta:
                            track = meta['track']
                        if 'album' in meta:
                            album = meta['album']
                        if 'duration' in meta:
                            duration = str(meta['duration'])
                    # Skip the entry if artist or track is empty
                    if not artist or not track:
                        print(f"{Fore.YELLOW}Skipping empty artist or track{Style.RESET_ALL}")
                        continue
                    # The artist doubles as the album-artist column
                    row = [artist, track, album, timestamp, artist, duration]
                    writer.writerow(row)
                    print(row)
                    # Save the index of the last processed div
                    save_last_processed_index(output_file, index)
        except KeyboardInterrupt:
            print(f"{Fore.RED}Parsing interrupted by user.{Style.RESET_ALL}")
    print(f"{Fore.GREEN}Parsing complete.{Style.RESET_ALL}")


def get_last_processed_index(output_file):
    progress_file = f"{output_file}.progress"
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as file:
            last_index = file.read()
        if last_index.isdigit():
            return int(last_index)
    return None


def save_last_processed_index(output_file, index):
    progress_file = f"{output_file}.progress"
    with open(progress_file, 'w') as file:
        file.write(str(index))
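
# Resume bookkeeping: each written row records its div index in an
# "<output_file>.progress" sidecar, and parse_html(..., resume=True) slices
# the parsed divs past that index while appending to the same CSV, so an
# interrupted run picks up where it left off.
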
def main(input_file, output_file):
    parse_html(input_file, output_file, resume=True)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <input.html> <output.csv>")
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])