Skip to content

Commit

Permalink
[anilist] delete recsys endpoint
Browse files Browse the repository at this point in the history
remove polars and scikit-learn deps
  • Loading branch information
NextFire committed Jul 1, 2024
1 parent 844f836 commit 6fc6ff2
Show file tree
Hide file tree
Showing 5 changed files with 3 additions and 265 deletions.
6 changes: 0 additions & 6 deletions nanapi/models/anilist.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pydantic import BaseModel

from nanapi.database.anilist.account_merge import ACCOUNT_MERGE_SERVICE
from nanapi.database.anilist.media_select import MediaSelectResult
from nanapi.models.waicolle import RANKS, WaicolleRank


Expand Down Expand Up @@ -352,8 +351,3 @@ class StaffNameAutocompleteResult(BaseModel):
id_al: int
name_user_preferred: str
name_native: str | None = None


class RecommendationResult(BaseModel):
    """One item of the list returned by the account recommendations route."""

    # Media being recommended.
    media: MediaSelectResult
    # Score associated with that media for the requesting account.
    score: float
28 changes: 0 additions & 28 deletions nanapi/routers/anilist.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from typing import cast

import polars as pl
from edgedb.errors import ConstraintViolationError
from fastapi import HTTPException, status
from fastapi.responses import StreamingResponse
Expand Down Expand Up @@ -35,12 +32,10 @@
MEDIA_TYPES,
CharaNameAutocompleteResult,
MediaTitleAutocompleteResult,
RecommendationResult,
StaffNameAutocompleteResult,
UpsertAnilistAccountBody,
)
from nanapi.settings import INSTANCE_NAME
from nanapi.utils.anilist import predict_scores
from nanapi.utils.clients import get_edgedb, get_meilisearch
from nanapi.utils.collages import chara_collage, media_collage
from nanapi.utils.fastapi import HTTPExceptionModel, NanAPIRouter
Expand Down Expand Up @@ -90,29 +85,6 @@ async def get_account_entries(discord_id: int,
return resp


@router.oauth2.get('/accounts/{discord_id}/recommendations',
                   response_model=list[RecommendationResult])
async def get_account_recommendations(discord_id: int):
    """Return up to 50 recommendations for the given Discord user.

    Media the user already has an entry for are excluded; the remaining
    media are ordered by the user's predicted score, highest first. An
    empty list is returned when the prediction table has no column for
    this user.
    """
    predictions, all_entries = await predict_scores()
    # Prediction columns are looked up by the Discord id rendered as a string.
    user_col = str(discord_id)
    own_entries = all_entries.filter(pl.col('discord_id') == discord_id)
    if user_col not in predictions:
        return []

    seen_ids = own_entries.select('id_al').to_series()
    unseen = predictions.filter(~pl.col('id_al').is_in(seen_ids))
    ranked = unseen.select(['id_al', user_col]).sort(user_col, descending=True)
    best = ranked.head(50)

    ids_al = cast(list[int], best.select('id_al').to_series().to_list())
    medias = await media_select(get_edgedb(), ids_al=ids_al)
    media_by_id = {media.id_al: media for media in medias}

    results = []
    for id_al, score in best.rows():
        results.append(
            RecommendationResult(media=media_by_id[id_al], score=score))
    return results


##########
# Medias #
##########
Expand Down
79 changes: 1 addition & 78 deletions nanapi/utils/anilist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,19 @@
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import timedelta
from itertools import count, filterfalse
from typing import Any, Generator, Generic, Optional, Self, Type, TypeVar

import aiohttp
import numpy as np
import orjson
import polars as pl
from asyncache import cached
from cachetools import TTLCache
from edgedb import AsyncIOExecutor
from pydantic import TypeAdapter
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from toolz.curried import concat
from toolz.itertoolz import partition_all

from nanapi.database.anilist.c_edge_merge_combined_by_chara import c_edge_merge_combined_by_chara
from nanapi.database.anilist.c_edge_merge_multiple import c_edge_merge_multiple
from nanapi.database.anilist.chara_merge_multiple import chara_merge_multiple
from nanapi.database.anilist.entry_select_all import entry_select_all
from nanapi.database.anilist.media_merge_combined_charas import media_merge_combined_charas
from nanapi.database.anilist.media_merge_multiple import media_merge_multiple
from nanapi.database.anilist.media_select_all_ids import MediaSelectAllIdsResult
Expand All @@ -46,7 +38,7 @@
MediaType,
)
from nanapi.settings import LOW_PRIORITY_THRESH, MAL_CLIENT_ID
from nanapi.utils.clients import get_edgedb, get_session
from nanapi.utils.clients import get_session
from nanapi.utils.misc import default_backoff

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1183,72 +1175,3 @@ async def edgedb_split_merge(executor: AsyncIOExecutor, medias: list,
await staff_merge_multiple(executor, staffs=part)
for part in partition_all(MERGE_COMBINED_MAX_SIZE, edges):
await c_edge_merge_multiple(executor, edges=part)


async def get_entries_df():
    """Build a polars DataFrame from the rows returned by ``entry_select_all``.

    Each row holds the entry's status value, its score, the AniList id of
    its media and the Discord id of the user behind its account.
    """
    rows = []
    for entry in await entry_select_all(get_edgedb()):
        rows.append({
            'status': entry.status.value,
            'score': entry.score,
            'id_al': entry.media.id_al,
            'discord_id': entry.account.user.discord_id,
        })
    return pl.DataFrame(rows)


def entries_to_scores_df(entries: pl.DataFrame) -> pl.DataFrame:
    """Pivot the long entries table into one row per media.

    The result is indexed by ``id_al`` and has one column per distinct
    ``discord_id`` value holding that user's score; when a user has several
    entries for the same media the maximum score is kept.
    """
    return entries.pivot(
        on='discord_id',
        index='id_al',
        values='score',
        aggregate_function='max',
    )


def _user_score_quantile(q: float) -> pl.Expr:
    """Return the ``q``-quantile of each user's strictly positive scores."""
    score = pl.col('score')
    return score.filter(score > 0).quantile(q).over('discord_id')


# ``total_seconds()`` is the whole duration; ``.seconds`` is only the seconds
# component of the timedelta and would wrap around for TTLs of a day or more.
@cached(cache=TTLCache(1024, ttl=timedelta(hours=6).total_seconds()))
async def predict_scores() -> tuple[pl.DataFrame, pl.DataFrame]:
    """Predict a score for every (media, user) pair with a rank-2 SVD.

    The result is cached for six hours.

    Returns:
        A ``(p_scores, entries)`` tuple. ``p_scores`` has an ``id_al`` column
        followed by one column per user containing the reconstructed
        scores. ``entries`` is the entries table after PLANNING rows and
        users without any score were dropped and missing scores were filled.
    """
    entries = await get_entries_df()

    # Remove PLANNING entries
    entries = entries.filter(pl.col('status') != 'PLANNING')

    # Remove users without any scored entries (null std = useless data)
    entries = entries.filter(pl.col('score').sum().over('discord_id') > 0)

    # Fill missing scores from the user's own score distribution, using a
    # quantile that depends on the entry status.
    status = pl.col('status')
    entries = entries.with_columns(
        pl.when(pl.col('score') > 0)
        .then(pl.col('score'))  # Nothing to do
        .when(status == 'CURRENT')
        .then(_user_score_quantile(0.50))
        .when(status == 'COMPLETED')
        .then(_user_score_quantile(0.25))
        .when(status == 'PAUSED')
        .then(_user_score_quantile(0.25))
        .when(status == 'DROPPED')
        .then(0)
        .when(status == 'REPEATING')
        .then(_user_score_quantile(0.75))
        .otherwise(pl.col('score'))  # Should not happen
        .alias('score')
    )

    scores = entries_to_scores_df(entries)
    scores_np = scores.select(pl.all().exclude('id_al')).to_numpy()

    # Standardize each user column
    scaler = preprocessing.StandardScaler()
    std_scores = scaler.fit_transform(scores_np)

    # SVD; NaNs (media a user has no entry for) are replaced by 0 first.
    # NOTE(review): with n_components=2 this presumably needs at least two
    # user columns -- confirm the behaviour on very small instances.
    svd = TruncatedSVD(n_components=2)
    decomp = svd.fit_transform(np.nan_to_num(std_scores))

    # Reconstruct scores on the original scale
    p_scores_np = svd.inverse_transform(decomp)
    p_scores_np = scaler.inverse_transform(p_scores_np)
    p_scores = pl.concat(
        (scores.select('id_al'),
         pl.DataFrame(p_scores_np, schema=scores.columns[1:])),
        how='horizontal')

    return p_scores, entries
153 changes: 2 additions & 151 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ python-multipart = "0.0.9"
passlib = { version = "1.7.4", extras = ["bcrypt"] }
cachetools = "5.3.3"
asyncache = "0.3.1"
polars = "1.0.0"
scikit-learn = "1.5.0"
hypercorn = "0.17.3"
tzdata = "2024.1"
pyjwt = "2.8.0"
Expand Down

0 comments on commit 6fc6ff2

Please sign in to comment.