diff --git a/lyricsgenius/__main__.py b/lyricsgenius/__main__.py index 3e2d845a..205c9589 100644 --- a/lyricsgenius/__main__.py +++ b/lyricsgenius/__main__.py @@ -27,6 +27,8 @@ def main(args=None): help="Specify number of songs when searching for artist") parser.add_argument("-q", "--quiet", action="store_true", help="Turn off the API verbosity") + parser.add_argument("-n", "--num-workers", type=int, default=1, + help="Number of threads used to get songs") args = parser.parse_args() # Create an instance of the Genius class @@ -51,7 +53,8 @@ def main(args=None): elif args.search_type == "artist": artist = api.search_artist(args.terms[0], max_songs=args.max_songs, - sort='popularity') + sort='popularity', + num_workers=args.num_workers) if args.save: if not args.quiet: print("Saving '{a}'' lyrics...".format(a=safe_unicode(artist.name))) diff --git a/lyricsgenius/genius.py b/lyricsgenius/genius.py index e9368f5a..24a0cf3f 100644 --- a/lyricsgenius/genius.py +++ b/lyricsgenius/genius.py @@ -6,15 +6,17 @@ import json import os +import queue import re import shutil import time +import threading from bs4 import BeautifulSoup from .api import API, PublicAPI from .types import Album, Artist, Song, Track -from .utils import clean_str, safe_unicode +from .utils import SongThread, clean_str, safe_unicode class Genius(API, PublicAPI): @@ -268,7 +270,8 @@ def song_annotations(self, song_id, text_format=None): return all_annotations def search_album(self, name=None, artist="", - album_id=None, get_full_info=True, text_format=None): + album_id=None, get_full_info=True, + text_format=None, num_workers=1): """Searches for a specific album and gets its songs. You must pass either a :obj:`name` or an :obj:`album_id`. @@ -297,6 +300,16 @@ def search_album(self, name=None, artist="", print(album.name) """ + def download_track(track): + song_info = track['song'] + if (song_info['lyrics_state'] == 'complete' + and not song_info.get('instrumental')): + song_lyrics = self.lyrics(song_url=song_info['url']) + else: + song_lyrics = "" + + track = Track(self, track, song_lyrics) + tracks.append(track) msg = "You must pass either a `name` or an `album_id`." assert any([name, album_id]), msg @@ -326,7 +339,8 @@ def search_album(self, name=None, artist="", tracks = [] next_page = 1 - + errors_queue = queue.Queue() + thread_pool = [] # It's unlikely for an album to have >=50 songs, # but it's best to check while next_page: @@ -337,17 +351,37 @@ def search_album(self, name=None, artist="", text_format=text_format ) for track in tracks_list['tracks']: - song_info = track['song'] - if (song_info['lyrics_state'] == 'complete' - and not song_info.get('instrumental')): - song_lyrics = self.lyrics(song_url=song_info['url']) + if num_workers != 1: + thread = SongThread( + errors_queue, + name="Thread-Track-{}".format(track['song']['id']), + target=download_track, + args=(track,) + ) + thread.daemon = True + thread.start() + thread_pool.append(thread) + if len(thread_pool) == num_workers: + for thread in thread_pool: + thread.join() + thread_pool.clear() else: - song_lyrics = "" - - track = Track(self, track, song_lyrics) - tracks.append(track) + download_track(track) next_page = tracks_list['next_page'] + for thread in thread_pool: + thread.join() + try: + error = errors_queue.get(False) + except queue.Empty: + pass + else: + raise error + + length = len(tracks) + tracks.sort(key=lambda track: (track.number + if track.number is not None + else length)) if album_id is None and get_full_info is True: new_info = self.album(album_id, text_format=text_format)['album'] @@ -456,6 +490,7 @@ def search_artist(self, artist_name, max_songs=None, allow_name_change=True, artist_id=None, include_features=False, + num_workers=1, ): """Searches for a specific artist and gets their songs. @@ -518,6 +553,39 @@ def find_artist_id(search_term): # Assume the top search result is the intended artist return found_artist['id'] + def download_song(song_info, index): + # Check if song is valid (e.g. contains lyrics) + if self.skip_non_songs and not self._result_is_lyrics(song_info): + valid = False + else: + valid = True + + # Reject non-song results (e.g. Linear Notes, Tracklists, etc.) + if not valid: + if self.verbose: + s = song_info['title'] + print('"{s}" is not valid. Skipping.'.format( + s=safe_unicode(s))) + return + + # Create the Song object from lyrics and metadata + if song_info['lyrics_state'] == 'complete': + lyrics = self.lyrics(song_url=song_info['url']) + else: + lyrics = "" + if get_full_info: + new_info = self.song(song_info['id'])['song'] + song_info.update(new_info) + song = Song(self, song_info, lyrics) + song._index = index + + # Attempt to add the Song to the Artist + result = artist.add_song(song, verbose=False, + include_features=include_features) + if result is not None and self.verbose: + print('Song {n}: "{t}"'.format(n=len(artist.songs), + t=safe_unicode(song.title))) + # Get the artist ID (or use the one supplied) artist_id = artist_id if artist_id else find_artist_id(artist_name) if not artist_id: @@ -535,62 +603,64 @@ def find_artist_id(search_term): artist = Artist(self, artist_info) # Download each song by artist, stored as Song objects in Artist object page = 1 + num_songs = 0 reached_max_songs = True if max_songs == 0 else False + thread_pool = [] + errors_queue = queue.Queue() while not reached_max_songs: songs_on_page = self.artist_songs(artist_id=artist_id, per_page=per_page, page=page, sort=sort, ) - - # Loop through each song on page of search results - for song_info in songs_on_page['songs']: - # Check if song is valid (e.g. contains lyrics) - if self.skip_non_songs and not self._result_is_lyrics(song_info): - valid = False - else: - valid = True - - # Reject non-song results (e.g. Linear Notes, Tracklists, etc.) - if not valid: - if self.verbose: - s = song_info['title'] - print('"{s}" is not valid. Skipping.'.format( - s=safe_unicode(s))) - continue - - # Create the Song object from lyrics and metadata - if song_info['lyrics_state'] == 'complete': - lyrics = self.lyrics(song_url=song_info['url']) + for song in songs_on_page["songs"]: + if num_workers != 1: + thread = SongThread( + errors_queue, + name="Thread-Song-{}".format(song['id']), + target=download_song, + args=(song, num_songs) + ) + thread.daemon = True + thread.start() + thread_pool.append(thread) + if len(thread_pool) == num_workers: + for thread in thread_pool: + thread.join() + thread_pool.clear() else: - lyrics = "" - if get_full_info: - new_info = self.song(song_info['id'])['song'] - song_info.update(new_info) - song = Song(self, song_info, lyrics) - - # Attempt to add the Song to the Artist - result = artist.add_song(song, verbose=False, - include_features=include_features) - if result is not None and self.verbose: - print('Song {n}: "{t}"'.format(n=artist.num_songs, - t=safe_unicode(song.title))) + download_song(song, num_songs) + num_songs += 1 # Exit search if the max number of songs has been met - reached_max_songs = max_songs and artist.num_songs >= max_songs + reached_max_songs = max_songs and num_songs - 1 >= max_songs if reached_max_songs: - if self.verbose: - print(('\nReached user-specified song limit ({m}).' - .format(m=max_songs))) break + for thread in thread_pool: + thread.join() + try: + error = errors_queue.get(False) + except queue.Empty: + pass + else: + raise error + thread_pool.clear() + + if reached_max_songs: + if self.verbose: + print(('\nReached user-specified song limit ({m}).' + .format(m=max_songs))) + break + # Move on to next page of search results page = songs_on_page['next_page'] if page is None: break # Exit search when last page is reached + artist.songs.sort(key=lambda x: x._index) if self.verbose: - print('Done. Found {n} songs.'.format(n=artist.num_songs)) + print('Done. Found {n} songs.'.format(n=len(artist.songs))) return artist def save_artists(self, artists, filename="artist_lyrics", overwrite=False, diff --git a/lyricsgenius/types/album.py b/lyricsgenius/types/album.py index e3ea3fdf..812335e5 100644 --- a/lyricsgenius/types/album.py +++ b/lyricsgenius/types/album.py @@ -126,4 +126,7 @@ def save_lyrics(self, def __repr__(self): name = self.__class__.__name__ - return "{}(number, song)".format(name) + return "{name}({number}, Song(id={song_id}))".format( + name=name, + number=self.number, + song_id=self.song.id) diff --git a/lyricsgenius/types/artist.py b/lyricsgenius/types/artist.py index 0c4350da..f6687207 100644 --- a/lyricsgenius/types/artist.py +++ b/lyricsgenius/types/artist.py @@ -18,7 +18,6 @@ def __init__(self, client, json_dict): self._body = body self._client = client self.songs = [] - self.num_songs = len(self.songs) self.api_path = body['api_path'] self.header_image_url = body['header_image_url'] @@ -76,7 +75,6 @@ def add_song(self, new_song, verbose=True, include_features=False): if (new_song.artist == self.name or (include_features and any(new_song._body['featured_artists']))): self.songs.append(new_song) - self.num_songs += 1 return new_song if verbose: print("Can't add song by {b}, artist must be {a}.".format( @@ -149,6 +147,7 @@ def save_lyrics(self, def __str__(self): """Return a string representation of the Artist object.""" - msg = "{name}, {num} songs".format(name=self.name, num=self.num_songs) - msg = msg[:-1] if self.num_songs == 1 else msg + num_songs = len(self.songs) + msg = "{name}, {num} songs".format(name=self.name, num=num_songs) + msg = msg[:-1] if num_songs == 1 else msg return msg diff --git a/lyricsgenius/utils.py b/lyricsgenius/utils.py index a29995fe..5c23db5d 100644 --- a/lyricsgenius/utils.py +++ b/lyricsgenius/utils.py @@ -2,6 +2,7 @@ import re import os +import threading import sys import unicodedata from datetime import datetime @@ -9,6 +10,21 @@ from urllib.parse import parse_qs, urlparse +class SongThread(threading.Thread): + def __init__(self, errors_queue, **kwargs): + super().__init__(**kwargs) + self.errors_queue = errors_queue + + def run(self): + try: + if self._target: + self._target(*self._args, **self._kwargs) + except Exception as e: + self.errors_queue.put(e) + finally: + del self._target, self._args, self._kwargs + + def auth_from_environment(): """Gets credentials from environment variables. diff --git a/tests/test_artist.py b/tests/test_artist.py index 42c852b3..e1b8aa98 100644 --- a/tests/test_artist.py +++ b/tests/test_artist.py @@ -41,7 +41,7 @@ def test_name(self): def test_add_song_from_same_artist(self): msg = "The new song was not added to the artist object." self.artist.add_song(genius.search_song(self.new_song, self.artist_name)) - self.assertEqual(self.artist.num_songs, self.max_songs + 1, msg) + self.assertEqual(len(self.artist.songs), self.max_songs + 1, msg) def test_song(self): msg = "Song was not in artist's songs." @@ -51,7 +51,7 @@ def test_song(self): def test_add_song_from_different_artist(self): msg = "A song from a different artist was incorrectly allowed to be added." self.artist.add_song(genius.search_song("These Days", "Jackson Browne")) - self.assertEqual(self.artist.num_songs, self.max_songs, msg) + self.assertEqual(len(self.artist.songs), self.max_songs, msg) def test_artist_with_includes_features(self): # The artist did not get songs returned that they were featured in.