Skip to content

Commit

Permalink
38 improve fuzzy matching (#103)
Browse files Browse the repository at this point in the history
* use fuzz.ratio instead of partial_ratio for fuzzy matching

add unit test for fuzzy matching
make fuzzy matching the only matching algo

* remove OBE test and commented out code

* update readme
  • Loading branch information
apastel authored Feb 8, 2025
1 parent 06414bf commit a68797a
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 69 deletions.
90 changes: 90 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,96 @@ def fixture_expected_dupe_groups() -> List[List[Dict]]:
]


@pytest.fixture(name="fuzzy_test_data")
def fixture_fuzzy_test_data() -> List[Dict]:
    """Provide upload/expected album pairs used to exercise fuzzy matching.

    Every returned dict carries the keys ``upload_artist``, ``upload_title``,
    ``expected_artist`` and ``expected_title`` (in that order). Cases whose
    expected values are both ``None`` describe uploads with no expected match.
    """
    field_names = ("upload_artist", "upload_title", "expected_artist", "expected_title")

    # One row per case, laid out in the same order as ``field_names``.
    rows = [
        ("The Offspring", "The Offspring", "The Offspring", "The Offspring"),
        (
            "Metallica",
            "Metallica Through The Never: Music From The Motion Picture [Disc 2]",
            "Metallica",
            "Metallica Through The Never (Music From The Motion Picture)",
        ),
        ("Newsted", "Heavy Metal Music", None, None),
        ("Between The Buried & Me", "Colors Live", "Between the Buried and Me", "Colors_Live"),
        # The leading space in the upload title is part of the test input.
        ("Billy Joel", " Greatest Hits, Vol. 3", "Billy Joel", "Greatest Hits Vol. III"),
        ("Eminem", "The Marshall Mathers LP 2", "Eminem", "The Marshall Mathers LP2"),
        ("Iron Maiden", "A Real Live One", None, None),
        ("The Beatles", "Meet The Beatles!", None, None),
        ("Todd Snider", "Last Songs For The Daily Planet", "Todd Snider", "Songs for the Daily Planet"),
        ("John Prine", "John Prine Live", "John Prine", "John Prine (Live)"),
        ("Nightwish", "Once Upon a Tour - Live In Buenos Aires", None, None),
        ("Eminem", "Relapse: Refill [Disc 2]", None, None),
        ("Megadeth", "Rust In Peace Live", None, None),
        ("The Beatles", "The Beatles' Second Album", None, None),
    ]
    return [dict(zip(field_names, row)) for row in rows]


@pytest.fixture(name="sample_public_playlist")
def fixture_sample_playlist() -> str:
"""'00s Metal"""
Expand Down
7 changes: 0 additions & 7 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ def test_add_to_library(self, yt_browser: YTMusic, upload_song, config):

return self.verify_added_to_library(yt_browser, config, result)

def test_add_to_library_fuzzy(self, yt_browser: YTMusic, upload_song, config):
    """Run ``delete-uploads -af`` through the CLI and verify the outcome.

    The command must exit with code 0; the remaining checks are delegated
    to ``verify_added_to_library``.
    """
    cli_args = ["delete-uploads", "-af"]
    runner = CliRunner()
    result = runner.invoke(cli, cli_args, standalone_mode=False, obj=yt_browser)
    # Echo the command output so it shows up in the captured test log.
    print(result.stdout)
    assert result.exit_code == 0

    return self.verify_added_to_library(yt_browser, config, result)

def verify_added_to_library(self, yt_browser: YTMusic, config, result):
albums_deleted, albums_total = result.return_value
assert albums_deleted >= 1, f"No uploads were deleted. {albums_total} uploads were found."
Expand Down
33 changes: 33 additions & 0 deletions tests/test_uploads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import ytmusicapi
from ytmusic_deleter import uploads


class TestUploads:
    """Tests for the album matching performed by the ``uploads`` module."""

    def test_fuzzy_matching(self, yt_browser: ytmusicapi.YTMusic, fuzzy_test_data):
        """Check every fixture case against ``uploads.add_album_to_library``.

        A case is counted as correct when either no match is returned and
        none was expected, or the returned match has the expected artist and
        title (compared case-insensitively). The test passes only when every
        case is correct; a diagnostic line is printed for each case.
        """
        score_cutoff = 85
        num_correct = 0

        for case in fuzzy_test_data:
            match = uploads.add_album_to_library(
                case["upload_artist"], case["upload_title"], yt_browser, score_cutoff
            )
            wanted_artist = case["expected_artist"]
            wanted_title = case["expected_title"]

            # No match came back: correct only if none was expected.
            if match is None:
                if wanted_artist is None and wanted_title is None:
                    print("Correctly failed to find match")
                    num_correct += 1
                else:
                    print("Failed to find any match when one was expected")
                    print(f"\tExpected: {wanted_artist} - {wanted_title}")
                continue

            # A match came back although the case expects none.
            if not (wanted_artist or wanted_title):
                print("Found match when none was expected")
                print(f"\tFound: {match['artist']} - {match['title']}")
                continue

            # A match came back and one was expected: compare ignoring case.
            if (
                match["artist"].lower() == wanted_artist.lower()
                and match["title"].lower() == wanted_title.lower()
            ):
                print("Correctly found match")
                num_correct += 1
                continue

            print("Found incorrect match")
            print(f"\tExpected: {wanted_artist} - {wanted_title}")
            print(f"\tActual: {match['artist']} - {match['title']}")

        assert num_correct == len(fuzzy_test_data), f"Only {num_correct} out of {len(fuzzy_test_data)} were correct"
3 changes: 1 addition & 2 deletions ytmusic_deleter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ When you run `ytmusic-deleter` with no parameters, you will see the usage in
`delete-uploads`:    Delete all tracks that you have uploaded to your YT Music library.

>Use the `--add-to-library` or `-a` option to add each album or song to your library from YouTube Music's online catalog before deleting it from your uploads. If a match could not be found, the album or song will remain in your uploads.
When using the `-a` option, you can also enable fuzzy matching with `--fuzzy` or `-f`. This is a less strict matching algorithm that will find more matches,
but may find inaccurate matches in its current experimental state. Use the `--score-cutoff` or `-s` option to raise or lower the default matching score cutoff of 90. A value closer to 100 will be more strict, and a value closer to 0 will be less strict.
When using the `-a` option, you can also use the `--score-cutoff` or `-s` option to raise or lower the default matching score cutoff of 85. A value closer to 100 will be more strict, and a value closer to 0 will be less strict.

`remove-library`:    Remove all tracks that you have added to your library from within YouTube Music.

Expand Down
8 changes: 1 addition & 7 deletions ytmusic_deleter/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,17 +83,11 @@ def whoami():
is_flag=True,
help="Add the corresponding album to your library before deleting a song from uploads.",
)
@click.option(
"--fuzzy",
"-f",
is_flag=True,
help="When using --add-to-library, this enables 'fuzzy' matching, allowing more flexibility when searching for matches among the YTM online catalog.", # noqa: B950
)
@click.option(
"--score-cutoff",
"-s",
default=90,
help="When combined with the --add-to-library and --fuzzy flags, this optional integer argument between 0 and 100 is used when finding matches in the YTM online catalog. No matches with a score less than this number will be added to your library. Defaults to 90.", # noqa: B950
help="When combined with the --add-to-library flag, this optional integer argument between 0 and 100 is used when finding matches in the YTM online catalog. No matches with a score less than this number will be added to your library. Defaults to 85.", # noqa: B950
)
@click.pass_context
def delete_uploads(ctx: click.Context, **kwargs):
Expand Down
84 changes: 31 additions & 53 deletions ytmusic_deleter/uploads.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ def maybe_delete_uploaded_albums() -> tuple[int, int]:
if get_current_context().params["add_to_library"]:
if artist == const.UNKNOWN_ARTIST or album_title == const.UNKNOWN_ALBUM:
if artist == const.UNKNOWN_ARTIST:
logging.warn("\tSong is missing artist metadata.")
logging.warning("\tSong is missing artist metadata.")
if album_title == const.UNKNOWN_ALBUM:
logging.warn("\tSong is missing album metadata.")
logging.warn("\tSkipping match search and will not delete.")
logging.warning("\tSong is missing album metadata.")
logging.warning("\tSkipping match search and will not delete.")
update_progress(progress_bar)
continue
elif not add_album_to_library(artist, album_title):
logging.warn(
logging.warning(
f"\tNo album was added to library for '{artist} - {album_title}'. Will not delete from uploads."
)
update_progress(progress_bar)
Expand All @@ -74,18 +74,20 @@ def maybe_delete_uploaded_albums() -> tuple[int, int]:
return (albums_deleted, len(album_unique_songs))


def add_album_to_library(upload_artist, upload_title) -> bool:
def add_album_to_library(upload_artist, upload_title, yt_auth: YTMusic = None, score_cutoff: int = None) -> dict | None:
"""
Search for "<artist> <album title>" in the YTM online catalog.
`Return`: `True` if an album was added to library, `False` otherwise
`Return`: match dict if an album was added to library, `None` otherwise
"""
# Allow passing in yt_auth from pytest
if not yt_auth:
yt_auth: YTMusic = get_current_context().obj["YT_AUTH"]
logging.info(f"\tSearching YT Music for albums like: '{upload_artist} - {upload_title}'")
yt_auth: YTMusic = get_current_context().obj["YT_AUTH"]
search_results = yt_auth.search(f"{upload_artist} {upload_title}", filter="albums")
if not search_results:
logging.info("No search results were found. It's possible Google is limiting your requests. Try again later.")
return False
logging.info("\tNo search results were found.")
return None
logging.info(f"\tThere were {len(search_results)} album results.")

# collect all search results into a simplified list
Expand All @@ -98,62 +100,38 @@ def artist_is_correct(search_result):
search_results = list(filter(artist_is_correct, search_results))
if not search_results:
logging.info("\tNone of the search results had the correct artist name.")
return False

if get_current_context().params["fuzzy"]:
return None

def scorer(query, choice):
return fuzz.partial_ratio(query, choice)

# Find the best match for the album title among the search results
match, score = process.extractOne(
upload_title, search_results, processor=lambda x: x["title"] if isinstance(x, dict) else x, scorer=scorer
)
def scorer(query, choice):
return fuzz.ratio(query, choice)

# Make sure this result at least passes the score cutoff
if score < get_current_context().params["score_cutoff"]:
logging.info(
f"\tThe best search result '{match['artist']} - {match['title']}' had a match score of {score} which does not pass the score cutoff of {get_current_context().params['score_cutoff']}." # noqa: B950
)
return False
# Find the best match for the album title among the search results
match, score = process.extractOne(
upload_title, search_results, processor=lambda x: x["title"] if isinstance(x, dict) else x, scorer=scorer
)

# Add the match to the library
# Make sure this result at least passes the score cutoff
if score_cutoff is None:
score_cutoff = get_current_context().params["score_cutoff"]
if score < score_cutoff:
logging.info(
f"\tFound match: '{match['artist']} - {match['title']}' with a matching score of {score}. Adding to library..."
f"\tThe best search result '{match['artist']} - {match['title']}' had a match score of {score} which does not pass the score cutoff of {score_cutoff}." # noqa: B950
)
else:
# TODO fix up fuzzy matching algorithms enough to the point where we don't need this anymore
match = None
for search_result in search_results:
search_result_artist = search_result["artist"]
search_result_title = search_result["title"]
if upload_title in search_result_title:
match = search_result
logging.info(f"\tFound match: '{match['artist']} - {match['title']}'. Adding to library...")
break
else:
# Try again but strip out parenthetical expressions at the end of the title, and all symbols
upload_title = const.strip_parentheticals(upload_title)
search_result_title = const.strip_parentheticals(search_result_title)
logging.info(f"\t\tSanitized upload is: {upload_artist} - {upload_title}")
logging.info(f"\t\tSanitized match is: {search_result_artist} - {search_result_title}")
if upload_title in search_result_title:
match = search_result
logging.info(f"\tFound match: '{match['artist']} - {match['title']}'. Adding to library...")
break
logging.info(f"\t\tThis {'IS' if match else 'is NOT'} a match")
if not match:
logging.info(f"No matches were found in YTM for `{upload_artist} - {upload_title}`")
return False
return None

# Add the match to the library
logging.info(
f"\tFound match: '{match['artist']} - {match['title']}' with a matching score of {score}. Adding to library..."
)

catalog_album = yt_auth.get_album(match["browseId"])
success = yt_auth.rate_playlist(catalog_album["audioPlaylistId"], const.LIKE)
if success:
logging.info("\tAdded album to library.")
return True
return match
else:
logging.error("\tFailed to add album to library")
return False
return None


class SearchResult(TypedDict):
Expand Down

0 comments on commit a68797a

Please sign in to comment.