diff --git a/NEWS.md b/NEWS.md
index d417976c9..9d8b1aeb6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,12 +5,21 @@ Licensed under the MIT License.

 # What's New

+## Update May 2, 2024
+
+We have a new release [Recommenders 1.2.0](https://github.com/microsoft/recommenders/releases/tag/1.2.0)!
+
+So many changes since our last release. We have full tests on Python 3.8 to 3.11 (around 1800 tests), upgraded performance in many algorithms, reviewed notebooks, and many more improvements.
+
+
 ## Update October 10, 2023

 We are pleased to announce that this repository (formerly known as Microsoft Recommenders, https://github.com/microsoft/recommenders), has joined the [Linux Foundation of AI and Data](https://lfaidata.foundation/) (LF AI & Data)! The new organization, `recommenders-team`, reflects this change.

 We hope this move makes it easy for anyone to contribute! Our objective continues to be building an ecosystem and a community to sustain open source innovations and collaborations in recommendation systems.

+Now to access the repo, instead of going to https://github.com/microsoft/recommenders, you need to go to https://github.com/recommenders-team/recommenders. The old URL will still resolve to the new one, but we recommend that you update your bookmarks.
+
 ## Update August 18, 2023

 We moved to a new organization! Now to access the repo, instead of going to https://github.com/microsoft/recommenders, you need to go to https://github.com/recommenders-team/recommenders. The old URL will still resolve to the new one, but we recommend you to update your bookmarks.
diff --git a/README.md b/README.md
index 89ef90ecf..35f526d1a 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,11 @@ Licensed under the MIT License.

-## What's New (October, 2023)
+## What's New (May, 2024)

-We are pleased to announce that this repository (formerly known as Microsoft Recommenders, https://github.com/microsoft/recommenders), has joined the [Linux Foundation of AI and Data](https://lfaidata.foundation/) (LF AI & Data)! The new organization, `recommenders-team`, reflects this change.
+We have a new release [Recommenders 1.2.0](https://github.com/microsoft/recommenders/releases/tag/1.2.0)!

-We hope this move makes it easy for anyone to contribute! Our objective continues to be building an ecosystem and a community to sustain open source innovations and collaborations in recommendation systems.
-
-Now to access the repo, instead of going to https://github.com/microsoft/recommenders, you need to go to https://github.com/recommenders-team/recommenders. The old URL will still resolve to the new one, but we recommend that you update your bookmarks.
+So many changes since our last release. We have full tests on Python 3.8 to 3.11 (around 1800 tests), upgraded performance in many algorithms, reviewed notebooks, and many more improvements.

 ## Introduction

@@ -35,6 +33,8 @@ Several utilities are provided in [recommenders](recommenders) to support common

 For a more detailed overview of the repository, please see the documents on the [wiki page](https://github.com/microsoft/recommenders/wiki/Documents-and-Presentations).

+For some of the practical scenarios where recommendation systems have been applied, see [scenarios](scenarios).
+
 ## Getting Started

 We recommend [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html?highlight=environment#conda-environment) for environment management, and [VS Code](https://code.visualstudio.com/) for development. To install the recommenders package and run an example notebook on Linux/WSL:
diff --git a/SETUP.md b/SETUP.md
index f06995e6e..814118a49 100644
--- a/SETUP.md
+++ b/SETUP.md
@@ -150,8 +150,6 @@ Currently, tests are done on **Python CPU** (the base environment), **Python GPU

 Another way is to build a docker image and use the functions inside a [docker container](#setup-guide-for-docker).

-Another alternative is to run all the recommender utilities directly from a local copy of the source code. This requires installing all the necessary dependencies from Anaconda and PyPI. For instructions on how to do this, see [this guide](conda.md).
-
 ## Setup for Making a Release

 The process of making a new release and publishing it to [PyPI](https://pypi.org/project/recommenders/) is as follows:
diff --git a/recommenders/__init__.py b/recommenders/__init__.py
index e28bf197f..87998b029 100644
--- a/recommenders/__init__.py
+++ b/recommenders/__init__.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.

 __title__ = "Recommenders"
-__version__ = "1.1.1"
+__version__ = "1.2.0"
 __author__ = "Recommenders contributors"
 __license__ = "MIT"
 __copyright__ = "Copyright 2018-present Recommenders contributors."
diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py
index e9adf621a..dff164ab4 100644
--- a/recommenders/evaluation/python_evaluation.py
+++ b/recommenders/evaluation/python_evaluation.py
@@ -541,6 +541,63 @@ def recall_at_k(
     return (df_hit_count["hit"] / df_hit_count["actual"]).sum() / n_users


+def r_precision_at_k(
+    rating_true,
+    rating_pred,
+    col_user=DEFAULT_USER_COL,
+    col_item=DEFAULT_ITEM_COL,
+    col_prediction=DEFAULT_PREDICTION_COL,
+    relevancy_method="top_k",
+    k=DEFAULT_K,
+    threshold=DEFAULT_THRESHOLD,
+    **_,
+):
+    """R-precision at K.
+
+    R-precision is defined as the precision@R for each user, where R is the
+    number of relevant items for the query. It is also equivalent to the recall
+    at the R-th position.
+
+    Note:
+        Since R can be high, in this implementation k acts as an upper bound on R.
+        If every user has more than k true items, then r-precision@k reduces to
+        recall@k. You might need to raise the k value to get meaningful results.
+
+    Args:
+        rating_true (pandas.DataFrame): True DataFrame
+        rating_pred (pandas.DataFrame): Predicted DataFrame
+        col_user (str): column name for user
+        col_item (str): column name for item
+        col_prediction (str): column name for prediction
+        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+            top k items are directly provided, so there is no need to compute the relevancy operation.
+        k (int): number of top k items per user
+        threshold (float): threshold of top items per user (optional)
+
+    Returns:
+        float: R-precision at k (min=0, max=1). The maximum value is 1 even when fewer than
+        k items exist for a user in rating_true.
+ """ + df_hit, df_hit_count, n_users = merge_ranking_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_prediction=col_prediction, + relevancy_method=relevancy_method, + k=k, + threshold=threshold, + ) + + if df_hit.shape[0] == 0: + return 0.0 + + df_merged = df_hit.merge(df_hit_count[[col_user, 'actual']]) + df_merged = df_merged[df_merged['rank'] <= df_merged['actual']] + + return (df_merged.groupby(col_user).size() / df_hit_count.set_index(col_user)['actual']).mean() + + def ndcg_at_k( rating_true, rating_pred, @@ -824,6 +881,7 @@ def get_top_k_items( exp_var.__name__: exp_var, precision_at_k.__name__: precision_at_k, recall_at_k.__name__: recall_at_k, + r_precision_at_k.__name__: r_precision_at_k, ndcg_at_k.__name__: ndcg_at_k, map_at_k.__name__: map_at_k, map.__name__: map, diff --git a/recommenders/utils/python_utils.py b/recommenders/utils/python_utils.py index 6efdedfed..36fb3f815 100644 --- a/recommenders/utils/python_utils.py +++ b/recommenders/utils/python_utils.py @@ -62,7 +62,7 @@ def jaccard(cooccurrence): with np.errstate(invalid="ignore", divide="ignore"): result = cooccurrence / (diag_rows + diag_cols - cooccurrence) - return np.array(result) + return np.array(result) if isinstance(result, np.ndarray) else result.toarray() def lift(cooccurrence): @@ -85,7 +85,7 @@ def lift(cooccurrence): with np.errstate(invalid="ignore", divide="ignore"): result = cooccurrence / (diag_rows * diag_cols) - return np.array(result) + return np.array(result) if isinstance(result, np.ndarray) else result.toarray() def mutual_information(cooccurrence): @@ -106,7 +106,7 @@ def mutual_information(cooccurrence): with np.errstate(invalid="ignore", divide="ignore"): result = np.log2(cooccurrence.shape[0] * lift(cooccurrence)) - return np.array(result) + return np.array(result) if isinstance(result, np.ndarray) else result.toarray() def lexicographers_mutual_information(cooccurrence): @@ -128,7 +128,7 @@ def lexicographers_mutual_information(cooccurrence): with np.errstate(invalid="ignore", divide="ignore"): result = cooccurrence * mutual_information(cooccurrence) - return np.array(result) + return np.array(result) if isinstance(result, np.ndarray) else result.toarray() def cosine_similarity(cooccurrence): @@ -151,7 +151,7 @@ def cosine_similarity(cooccurrence): with np.errstate(invalid="ignore", divide="ignore"): result = cooccurrence / np.sqrt(diag_rows * diag_cols) - return np.array(result) + return np.array(result) if isinstance(result, np.ndarray) else result.toarray() def inclusion_index(cooccurrence): @@ -173,7 +173,7 @@ def inclusion_index(cooccurrence): with np.errstate(invalid="ignore", divide="ignore"): result = cooccurrence / np.minimum(diag_rows, diag_cols) - return np.array(result) + return np.array(result) if isinstance(result, np.ndarray) else result.toarray() def get_top_k_scored_items(scores, top_k, sort_top_k=False): diff --git a/setup.py b/setup.py index c5fc49bb8..db4e1012b 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "retrying>=1.3.4,<2", "scikit-learn>=1.2.0,<2", # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df "scikit-surprise>=1.1.3", - "scipy>=1.10.1,<1.11.0", # FIXME: We limit <1.11.0 until #1954 is fixed + "scipy>=1.10.1", "seaborn>=0.13.0,<1", # requires matplotlib, packaging "transformers>=4.27.0,<5", # requires packaging, pyyaml, requests, tqdm ] diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py 
diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py
index 4f0d4730b..e2f6dc149 100644
--- a/tests/unit/recommenders/evaluation/test_python_evaluation.py
+++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py
@@ -25,6 +25,7 @@
     exp_var,
     get_top_k_items,
     precision_at_k,
+    r_precision_at_k,
     recall_at_k,
     ndcg_at_k,
     map_at_k,
@@ -366,6 +367,20 @@ def test_python_recall_at_k(rating_true, rating_pred, rating_nohit):
     assert recall_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)


+def test_python_r_precision(rating_true, rating_pred, rating_nohit):
+    assert r_precision_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1, TOL)
+    assert r_precision_at_k(rating_true, rating_nohit, k=5) == 0.0
+    assert r_precision_at_k(rating_true, rating_pred, k=3) == pytest.approx(0.21111, TOL)
+    assert r_precision_at_k(rating_true, rating_pred, k=5) == pytest.approx(0.24444, TOL)
+    # Equivalent to recall at k=10 (same value as recall_at_k in the test above)
+    assert r_precision_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)
+
+
 def test_python_auc(rating_true_binary, rating_pred_binary):
     assert auc(
         rating_true=rating_true_binary,
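For reference, here is a small usage sketch of the new `r_precision_at_k` metric once this release is installed. It is not part of the patch; the toy DataFrames are made up for illustration, and the column names are passed explicitly so the example does not rely on the package defaults.

```python
# Usage sketch (not from this patch) for the new r_precision_at_k metric.
import pandas as pd
from recommenders.evaluation.python_evaluation import r_precision_at_k

# Toy data: user 1 has R=3 relevant items, user 2 has R=2.
rating_true = pd.DataFrame(
    {
        "userID": [1, 1, 1, 2, 2],
        "itemID": [10, 11, 12, 10, 13],
        "rating": [5, 4, 3, 5, 4],
    }
)
rating_pred = pd.DataFrame(
    {
        "userID": [1, 1, 1, 2, 2],
        "itemID": [10, 12, 14, 13, 15],
        "prediction": [0.9, 0.7, 0.6, 0.8, 0.4],
    }
)

score = r_precision_at_k(
    rating_true,
    rating_pred,
    col_user="userID",
    col_item="itemID",
    col_prediction="prediction",
    k=10,  # k only caps R; with k >= R for every user this is true R-precision
)
print(score)  # user 1: 2 hits in its top 3 -> 2/3; user 2: 1 hit in its top 2 -> 1/2; mean ~ 0.583
```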