diff --git a/lenskit/lenskit/knn/item.py b/lenskit/lenskit/knn/item.py index aa5011218..48b340b6a 100644 --- a/lenskit/lenskit/knn/item.py +++ b/lenskit/lenskit/knn/item.py @@ -16,7 +16,8 @@ import numpy as np import torch -from typing_extensions import Callable, Optional, TypeAlias, override +from scipy.sparse import csr_array +from typing_extensions import Optional, override from lenskit import util from lenskit.data import Dataset, FeedbackType, ItemList, QueryInput, RecQuery, Vocabulary @@ -25,6 +26,7 @@ from lenskit.math.sparse import normalize_sparse_rows, safe_spmv from lenskit.parallel import ensure_parallel_init from lenskit.pipeline import Component, Trainable +from lenskit.util.torch import inference_mode _log = logging.getLogger(__name__) MAX_BLOCKS = 1024 @@ -39,6 +41,12 @@ class ItemKNNScorer(Component, Trainable): explicit-feedback mode, its output is equivalent to that of the Java version. + .. note:: + + This component must be used with queries containing the user's history, + either directly in the input or by wiring its query input to the output of a + user history component (e.g., :class:`~lenskit.basic.UserTrainingHistoryLookup`). + Args: nnbrs: The maximum number of neighbors for scoring each item (``None`` for @@ -66,16 +74,14 @@ class ItemKNNScorer(Component, Trainable): items_: Vocabulary "Vocabulary of item IDs." - item_means_: torch.Tensor | None + item_means_: np.ndarray[int, np.dtype[np.float32]] | None "Mean rating for each known item." - item_counts_: torch.Tensor + item_counts_: np.ndarray[int, np.dtype[np.int32]] "Number of saved neighbors for each item." - sim_matrix_: torch.Tensor + sim_matrix_: csr_array "Similarity matrix (sparse CSR tensor)." users_: Vocabulary "Vocabulary of user IDs." - rating_matrix_: torch.Tensor - "Normalized rating matrix to look up user ratings at prediction time." 
def __init__( self, @@ -111,6 +117,7 @@ def is_trained(self) -> bool: return hasattr(self, "items_") @override + @inference_mode def train(self, data: Dataset): """ Train a model. @@ -175,16 +182,15 @@ def train(self, data: Dataset): _log.info("[%s] computed %d neighbor pairs", self._timer, len(smat.col_indices())) self.items_ = data.items - self.item_means_ = means - self.item_counts_ = torch.diff(smat.crow_indices()) - self.sim_matrix_ = smat + self.item_means_ = means.numpy() if means is not None else None + self.item_counts_ = torch.diff(smat.crow_indices()).numpy() + self.sim_matrix_ = csr_array( + (smat.values(), smat.col_indices(), smat.crow_indices()), smat.shape + ) self.users_ = data.users - self.rating_matrix_ = init_rmat _log.debug("[%s] done, memory use %s", self._timer, util.max_memory()) - return self - - def _compute_similarities(self, rmat: torch.Tensor): + def _compute_similarities(self, rmat: torch.Tensor) -> torch.Tensor: nitems, nusers = rmat.shape bs = max(self.block_size, nitems // MAX_BLOCKS) @@ -195,72 +201,104 @@ def _compute_similarities(self, rmat: torch.Tensor): return smat.to(torch.float32) @override + @inference_mode def __call__(self, query: QueryInput, items: ItemList) -> ItemList: query = RecQuery.create(query) _log.debug("predicting %d items for user %s", len(items), query.user_id) ratings = query.user_items - if ratings is None: - if query.user_id is None: - warnings.warn( - "cannot recommend without without either user ID or items", DataWarning - ) - return ItemList(items, scores=np.nan) - - upos = self.users_.number(query.user_id, missing=None) - if upos is None: - _log.debug("user %s missing, returning empty predictions", query.user_id) - return ItemList(items, scores=np.nan) - row = self.rating_matrix_[upos] # type: ignore - ratings = ItemList( - item_nums=row.indices()[0], rating=row.values(), vocabulary=self.items_ - ) + if ratings is None or len(ratings) == 0: + if ratings is None: + warnings.warn("no user history, did you 
omit a history component?", DataWarning) + _log.debug("user has no history, returning") + return ItemList(items, scores=np.nan) # set up rating array # get rated item positions & limit to in-model items - ri_pos = ratings.numbers(format="torch", vocabulary=self.items_, missing="negative") - ri_mask = ri_pos >= 0 - ri_vpos = ri_pos[ri_mask] - n_valid = len(ri_vpos) + ri_nums = ratings.numbers(format="torch", vocabulary=self.items_, missing="negative") + ri_mask = ri_nums >= 0 + ri_valid_nums = ri_nums[ri_mask] + n_valid = len(ri_valid_nums) _log.debug("user %s: %d of %d rated items in model", query.user_id, n_valid, len(ratings)) if self.feedback == "explicit": - ri_vals = ratings.field("rating", "torch") + ri_vals = ratings.field("rating", "numpy") if ri_vals is None: raise RuntimeError("explicit-feedback scorer must have ratings") - ri_vals = ri_vals[ri_mask].to(torch.float64) + ri_vals = np.require(ri_vals[ri_mask], np.float32) else: - ri_vals = torch.full((n_valid,), 1.0, dtype=torch.float64) + ri_vals = np.full(n_valid, 1.0, dtype=np.float32) # mean-center the rating array + if self.item_means_ is not None: + ri_vals -= self.item_means_[ri_valid_nums] + + # convert target item information + ti_nums = items.numbers(vocabulary=self.items_, missing="negative") + ti_mask = ti_nums >= 0 + ti_valid_nums = ti_nums[ti_mask] + + # subset the model to rated and target items + model = self.sim_matrix_ + model = model[ri_valid_nums, :] + assert isinstance(model, csr_array) + model = model[:, ti_valid_nums] + assert isinstance(model, csr_array) + # convert to CSC so we can count neighbors per target item. + model = model.tocsc() + + # count neighborhood sizes + sizes = np.diff(model.indptr) + # which neighborhoods are usable? 
(at least min neighbors) + scorable = sizes >= self.min_nbrs + + # fast-path neighborhoods that fit within max neighbors + fast = sizes <= self.nnbrs + ti_fast_mask = ti_mask.copy() + ti_fast_mask[ti_mask] = scorable & fast + + scores = np.full(len(items), np.nan, dtype=np.float32) + fast_mod = model[:, scorable & fast] if self.feedback == "explicit": - assert self.item_means_ is not None - ri_vals -= self.item_means_[ri_vpos] - - # now compute the predictions - if self.feedback == "explicit": - sims = _predict_weighted_average( - self.sim_matrix_, (self.min_nbrs, self.nnbrs), ri_vals, ri_vpos - ) - sims += self.item_means_ + scores[ti_fast_mask] = ri_vals @ fast_mod + scores[ti_fast_mask] /= fast_mod.sum(axis=0) else: - sims = _predict_sum(self.sim_matrix_, (self.min_nbrs, self.nnbrs), ri_vals, ri_vpos) - - # and prepare the output - scores = torch.full((len(items),), np.nan, dtype=sims.dtype) - out_nums = items.numbers("torch", vocabulary=self.items_, missing="negative") - out_good = out_nums >= 0 - scores[out_good] = sims[out_nums[out_nums >= 0]] - results = ItemList(items, scores=scores) + scores[ti_fast_mask] = fast_mod.sum(axis=0) + + # slow path: neighborhoods that we need to truncate. we will convert to + # PyTorch, make a dense matrix (this is usually small enough to be + # usable), and use the Torch topk function. + slow_mat = model.T[~fast, :] + assert isinstance(slow_mat, csr_array) + n_slow, _ = slow_mat.shape + if n_slow: + # mask for the slow items. 
+ ti_slow_mask = ti_mask.copy() + ti_slow_mask[ti_mask] = ~fast + + slow_mat = torch.from_numpy(slow_mat.toarray()) + slow_trimmed, slow_inds = torch.topk(slow_mat, self.nnbrs) + assert slow_trimmed.shape == (n_slow, self.nnbrs) + if self.feedback == "explicit": + scores[ti_slow_mask] = torch.sum( + slow_trimmed * torch.from_numpy(ri_vals)[slow_inds], axis=1 + ).numpy() + scores[ti_slow_mask] /= torch.sum(slow_trimmed, axis=1).numpy() + else: + scores[ti_slow_mask] = torch.sum(slow_trimmed, axis=1).numpy() + + # re-add the item means if the ratings were mean-centered + if self.item_means_ is not None: + scores[ti_mask] += self.item_means_[ti_valid_nums] _log.debug( "user %s: predicted for %d of %d items", query.user_id, - int(torch.isfinite(scores).sum()), + int(np.isfinite(scores).sum()), len(items), ) - return results + return ItemList(items, scores=scores) def __str__(self): return "ItemItem(nnbrs={}, msize={})".format(self.nnbrs, self.save_nbrs) @@ -361,172 +399,3 @@ def _sim_blocks( values=c_values, size=(nitems, nitems), ) - - -def _predict_weighted_average( - model: torch.Tensor, - nrange: tuple[int, int], - rate_v: torch.Tensor, - rated: torch.Tensor, -) -> torch.Tensor: - "Weighted average prediction function" - nitems, _ni = model.shape - assert nitems == _ni - min_nbrs, max_nbrs = nrange - - # we proceed rating-by-rating, and accumulate results - scores = torch.zeros(nitems) - t_sims = torch.zeros(nitems) - counts = torch.zeros(nitems, dtype=torch.int32) - # these store the similarities and values for neighbors, so we can un-count - nbr_sims = torch.empty((nitems, max_nbrs)) - nbr_vals = torch.empty((nitems, max_nbrs)) - # and this stores the smallest similarity so far for each item - nbr_min = torch.full((nitems,), torch.finfo().max) - - for i, iidx in enumerate(rated): - row = model[int(iidx)] - row_is = row.indices()[0] - row_vs = row.values() - assert row_is.shape == row_vs.shape - - row_avs = torch.abs(row_vs) - fast = counts[row_is] < max_nbrs - - # save the 
fast-path items - if torch.any(fast): - ris_fast = row_is[fast] - vs_fast = row_vs[fast] - avs_fast = row_avs[fast] - vals_fast = vs_fast * rate_v[i] - nbr_sims[ris_fast, counts[ris_fast]] = vs_fast - nbr_vals[ris_fast, counts[ris_fast]] = vals_fast - counts[ris_fast] += 1 - t_sims[ris_fast] += avs_fast - scores[ris_fast] += vals_fast - nbr_min[ris_fast] = torch.minimum(nbr_min[ris_fast], vs_fast) - - # skip early if we're done - if torch.all(fast): - continue - - # now we have the slow-path items - slow = torch.logical_not(fast) - ris_slow = row_is[slow] - rvs_slow = row_vs[slow] - # which slow items might actually need an update? - exc = rvs_slow > nbr_min[ris_slow] - if not torch.any(exc): - continue - - ris_slow = ris_slow[exc] - rvs_slow = rvs_slow[exc] - - # this is brute-force linear search for simplicity right now - # for each, find the neighbor that's the smallest: - min_sims, mins = torch.min(nbr_sims[ris_slow], dim=1) - assert torch.all(min_sims < rvs_slow) - - # now we need to update values: add in new and remove old - min_vals = nbr_vals[ris_slow, mins] - ravs_slow = row_avs[slow][exc] - slow_vals = rvs_slow * rate_v[i] - t_sims[ris_slow] += ravs_slow - min_sims.abs() - scores[ris_slow] += slow_vals - min_vals - # and save - nbr_sims[ris_slow, mins] = ravs_slow - nbr_vals[ris_slow, mins] = slow_vals - # and now we need to update the saved minimums - nm_sims, _nm_is = torch.min(nbr_sims[ris_slow], dim=1) - nbr_min[ris_slow] = nm_sims - - # compute averages for items that pass match the threshold - mask = counts >= min_nbrs - scores[mask] /= t_sims[mask] - scores[torch.logical_not(mask)] = torch.nan - - return scores - - -def _predict_sum( - model: torch.Tensor, - nrange: tuple[int, int], - rate_v: torch.Tensor, - rated: torch.Tensor, -) -> torch.Tensor: - "Sum-of-similarities prediction function" - nitems, _ni = model.shape - assert nitems == _ni - min_nbrs, max_nbrs = nrange - _msg(logging.DEBUG, f"sum-scoring with {len(rated)} items") - - # we proceed 
rating-by-rating, and accumulate results - t_sims = torch.zeros(nitems) - counts = torch.zeros(nitems, dtype=torch.int32) - nbr_sims = torch.zeros((nitems, max_nbrs)) - # and this stores the smallest similarity so far for each item - nbr_min = torch.full((nitems,), torch.finfo().max) - - for i, iidx in enumerate(rated): - iidx = int(iidx) - row = model[iidx] - row_is = row.indices()[0] - row_vs = row.values() - assert row_is.shape == row_vs.shape - - fast = counts[row_is] < max_nbrs - - # save the fast-path items - if torch.any(fast): - ris_fast = row_is[fast] - vs_fast = row_vs[fast] - nbr_sims[ris_fast, counts[ris_fast]] = vs_fast - counts[ris_fast] += 1 - t_sims[ris_fast] += vs_fast - nbr_min[ris_fast] = torch.minimum(nbr_min[ris_fast], vs_fast) - - # skip early if we're done - if torch.all(fast): - continue - - # now we have the slow-path items - slow = torch.logical_not(fast) - ris_slow = row_is[slow] - rvs_slow = row_vs[slow] - # which slow items might actually need an update? - exc = rvs_slow > nbr_min[ris_slow] - if not torch.any(exc): - continue - - ris_slow = ris_slow[exc] - rvs_slow = rvs_slow[exc] - - # this is brute-force linear search for simplicity right now - # for each, find the neighbor that's the smallest: - min_sims, mins = torch.min(nbr_sims[ris_slow], dim=1) - - # now we need to update values: add in new and remove old - # anywhere our new neighbor is grater than smallest, replace smallest - t_sims[ris_slow] -= min_sims - t_sims[ris_slow] += rvs_slow - # and save - nbr_sims[ris_slow, mins] = rvs_slow - # save the minimums - nm_sims, _nm_is = torch.min(nbr_sims[ris_slow], dim=1) - nbr_min[ris_slow] = nm_sims - - # compute averages for items that pass match the threshold - t_sims[counts < min_nbrs] = torch.nan - - return t_sims - - -AggFun: TypeAlias = Callable[ - [ - torch.Tensor, - tuple[int, int], - torch.Tensor, - torch.Tensor, - ], - torch.Tensor, -] diff --git a/lenskit/lenskit/util/torch.py b/lenskit/lenskit/util/torch.py new file mode 
100644 index 000000000..88f5bf456 --- /dev/null +++ b/lenskit/lenskit/util/torch.py @@ -0,0 +1,20 @@ +""" +PyTorch utility functions. +""" + +import functools + +import torch + + +def inference_mode(func): + """ + Function decorator that puts PyTorch in inference mode. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with torch.inference_mode(): + return func(*args, **kwargs) + + return wrapper diff --git a/lenskit/tests/models/test_knn_item_item.py b/lenskit/tests/models/test_knn_item_item.py index 3ef5a3d34..cb0957cca 100644 --- a/lenskit/tests/models/test_knn_item_item.py +++ b/lenskit/tests/models/test_knn_item_item.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd import torch +from numpy.typing import NDArray from scipy import linalg as la import pytest @@ -18,15 +19,18 @@ from lenskit import batch from lenskit.basic import BiasScorer +from lenskit.basic.history import UserTrainingHistoryLookup from lenskit.batch import BatchPipelineRunner from lenskit.data import ItemList, ItemListCollection, UserIDKey, Vocabulary, from_interactions_df from lenskit.data.bulk import dict_to_df, iter_item_lists from lenskit.diagnostics import ConfigWarning, DataWarning from lenskit.knn.item import ItemKNNScorer from lenskit.metrics import MAE, RBP, RMSE, RecipRank, RunAnalysis, call_metric, quick_measure_model +from lenskit.operations import score from lenskit.pipeline import RecPipelineBuilder, topn_pipeline from lenskit.splitting import SampleFrac, crossfold_users from lenskit.testing import BasicComponentTests, ScorerTests, wantjit +from lenskit.util.torch import inference_mode _log = logging.getLogger(__name__) @@ -79,13 +83,13 @@ def test_ii_train(): algo = ItemKNNScorer(30, save_nbrs=500) algo.train(simple_ds) - assert isinstance(algo.item_means_, torch.Tensor) - assert isinstance(algo.item_counts_, torch.Tensor) + assert isinstance(algo.item_means_, np.ndarray) + assert isinstance(algo.item_counts_, np.ndarray) matrix = algo.sim_matrix_ 
test_means = simple_ratings.groupby("item")["rating"].mean() test_means = test_means.reindex(algo.items_.ids()) - assert np.all(algo.item_means_.numpy() == test_means.values.astype("f8")) + assert np.all(algo.item_means_ == test_means.values.astype("f8")) # 6 is a neighbor of 7 six, seven = algo.items_.numbers([6, 7]) @@ -103,20 +107,20 @@ def test_ii_train(): num = six_v.dot(seven_v) assert matrix[six, seven] == approx(num / denom, 0.01) # type: ignore - assert all(np.logical_not(np.isnan(algo.sim_matrix_.values().numpy()))) - assert all(algo.sim_matrix_.values() > 0) + assert all(np.logical_not(np.isnan(algo.sim_matrix_.data))) + assert all(algo.sim_matrix_.data > 0) # a little tolerance - assert all(algo.sim_matrix_.values() < 1 + 1.0e-6) + assert all(algo.sim_matrix_.data < 1 + 1.0e-6) def test_ii_train_unbounded(): algo = ItemKNNScorer(30) algo.train(simple_ds) - assert all(np.logical_not(np.isnan(algo.sim_matrix_.values()))) - assert all(algo.sim_matrix_.values() > 0) + assert all(np.logical_not(np.isnan(algo.sim_matrix_.data))) + assert all(algo.sim_matrix_.data > 0) # a little tolerance - assert all(algo.sim_matrix_.values() < 1 + 1.0e-6) + assert all(algo.sim_matrix_.data < 1 + 1.0e-6) # 6 is a neighbor of 7 matrix = algo.sim_matrix_ @@ -135,10 +139,13 @@ def test_ii_train_unbounded(): def test_ii_simple_predict(): + history = UserTrainingHistoryLookup() + history.train(simple_ds) algo = ItemKNNScorer(30, save_nbrs=500) algo.train(simple_ds) - res = algo(3, ItemList([6])) + q = history(3) + res = algo(q, ItemList([6])) _log.info("got predictions: %s", res) assert res is not None assert len(res) == 1 @@ -147,10 +154,13 @@ def test_ii_simple_predict(): def test_ii_simple_implicit_predict(): + history = UserTrainingHistoryLookup() + history.train(simple_ds) algo = ItemKNNScorer(30, feedback="implicit") algo.train(from_interactions_df(simple_ratings.loc[:, ["user", "item"]])) - res = algo(3, ItemList([6])) + q = history(3) + res = algo(q, ItemList([6])) assert 
res is not None assert len(res) == 1 assert 6 in res.ids() @@ -159,10 +169,13 @@ def test_ii_simple_implicit_predict(): def test_ii_simple_predict_unknown(): + history = UserTrainingHistoryLookup() + history.train(simple_ds) algo = ItemKNNScorer(30, save_nbrs=500) algo.train(simple_ds) - res = algo(3, ItemList([6, 100])) + q = history(3) + res = algo(q, ItemList([6, 100])) _log.info("got predictions: %s", res) assert res is not None assert len(res) == 2 @@ -181,6 +194,7 @@ def test_ii_warns_center(): @wantjit @mark.slow +@inference_mode def test_ii_train_ml100k(tmp_path, ml_100k): "Test an unbounded model on ML-100K" algo = ItemKNNScorer(30) @@ -189,13 +203,13 @@ def test_ii_train_ml100k(tmp_path, ml_100k): _log.info("testing model") - assert all(np.logical_not(np.isnan(algo.sim_matrix_.values()))) - assert all(algo.sim_matrix_.values() > 0) + assert all(np.logical_not(np.isnan(algo.sim_matrix_.data))) + assert all(algo.sim_matrix_.data > 0) # a little tolerance - assert np.max(algo.sim_matrix_.values().numpy()) <= 1 + assert np.max(algo.sim_matrix_.data) <= 1 - assert algo.item_counts_.sum() == len(algo.sim_matrix_.values()) + assert algo.item_counts_.sum() == len(algo.sim_matrix_.data) means = ml_100k.groupby("item_id").rating.mean() assert means[algo.items_.ids()].values == approx(algo.item_means_) @@ -210,16 +224,17 @@ def test_ii_train_ml100k(tmp_path, ml_100k): with fn.open("rb") as modf: restored = pickle.load(modf) - assert all(restored.sim_matrix_.values() > 0) + assert all(restored.sim_matrix_.data > 0) r_mat = restored.sim_matrix_ o_mat = algo.sim_matrix_ - assert all(r_mat.values() == o_mat.values()) + assert all(r_mat.data == o_mat.data) @wantjit @mark.slow +@inference_mode def test_ii_large_models(rng, ml_ratings, ml_ds): "Several tests of large trained I-I models" _log.info("training limited model") @@ -232,17 +247,17 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): algo_ub.train(ml_ds) _log.info("testing models") - assert 
all(np.logical_not(np.isnan(algo_lim.sim_matrix_.values()))) - assert algo_lim.sim_matrix_.values().min() > 0 + assert all(np.logical_not(np.isnan(algo_lim.sim_matrix_.data))) + assert algo_lim.sim_matrix_.data.min() > 0 # a little tolerance - assert algo_lim.sim_matrix_.values().max() <= 1 + assert algo_lim.sim_matrix_.data.max() <= 1 means = ml_ratings.groupby("item_id").rating.mean() assert means[algo_lim.items_.ids()].values == approx(algo_lim.item_means_) - assert all(np.logical_not(np.isnan(algo_ub.sim_matrix_.values()))) - assert algo_ub.sim_matrix_.values().min() > 0 - assert algo_ub.sim_matrix_.values().max() <= 1 + assert all(np.logical_not(np.isnan(algo_ub.sim_matrix_.data))) + assert algo_ub.sim_matrix_.data.min() > 0 + assert algo_ub.sim_matrix_.data.max() <= 1 means = ml_ratings.groupby("item_id").rating.mean() assert means[algo_ub.items_.ids()].values == approx(algo_ub.item_means_) @@ -258,10 +273,10 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): _log.info("make sure the similarity matrix is sorted") for i in range(algo_lim.items_.size): - sp = algo_lim.sim_matrix_.crow_indices()[i] - ep = algo_lim.sim_matrix_.crow_indices()[i + 1] - cols = algo_lim.sim_matrix_.col_indices()[sp:ep] - diffs = np.diff(cols.numpy()) + sp = algo_lim.sim_matrix_.indptr[i] + ep = algo_lim.sim_matrix_.indptr[i + 1] + cols = algo_lim.sim_matrix_.indices[sp:ep] + diffs = np.diff(cols) if np.any(diffs <= 0): _log.error("row %d: %d non-sorted indices", i, np.sum(diffs <= 0)) (bad,) = np.nonzero(diffs <= 0) @@ -271,18 +286,18 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): _log.info("checking a sample of neighborhoods") items = algo_ub.items_.ids() - items = items[algo_ub.item_counts_.numpy() > 0] + items = items[algo_ub.item_counts_ > 0] for i in rng.choice(items, 50): ipos = algo_ub.items_.number(i) _log.debug("checking item %d at position %d", i, ipos) assert ipos == algo_lim.items_.number(i) irates = mc_rates.loc[[i], :].set_index("user_id").rating - ub_row = 
mat_ub[ipos] - b_row = mat_lim[ipos] - assert len(b_row.values()) <= MODEL_SIZE - ub_cols = ub_row.indices()[0].numpy() - b_cols = b_row.indices()[0].numpy() + ub_row = mat_ub[[ipos]] + b_row = mat_lim[[ipos]] + assert len(b_row.data) <= MODEL_SIZE + ub_cols = ub_row.indices + b_cols = b_row.indices _log.debug("kept %d of %d neighbors", len(b_cols), len(ub_cols)) _log.debug("checking for sorted indices") @@ -294,7 +309,7 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): present = np.isin(b_cols, ub_cols) if not np.all(present): _log.error("missing items: %s", b_cols[~present]) - _log.error("scores: %s", b_row.values()[~present]) # type: ignore + _log.error("scores: %s", b_row.data[~present]) # type: ignore raise AssertionError(f"missing {np.sum(~present)} values from unbounded") # spot-check some similarities @@ -304,18 +319,18 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): n_rates = mc_rates.loc[n_id, :].set_index("user_id").rating ir, nr = irates.align(n_rates, fill_value=0) cor = ir.corr(nr) - assert mat_ub[ipos, n].item() == approx(cor, abs=1.0e-6) + assert mat_ub[ipos, n] == approx(cor, abs=1.0e-6) # short rows are equal if len(b_cols) < MODEL_SIZE: _log.debug("short row of length %d", len(b_cols)) assert len(b_row) == len(ub_row) - assert b_row.values().numpy() == approx(ub_row.values().numpy()) + assert b_row.data == approx(ub_row.data) continue # row is truncated - check that truncation is correct - ub_nbrs = pd.Series(ub_row.values().numpy(), algo_ub.items_.ids(ub_cols)) - b_nbrs = pd.Series(b_row.values().numpy(), algo_lim.items_.ids(b_cols)) + ub_nbrs = pd.Series(ub_row.data, algo_ub.items_.ids(ub_cols)) + b_nbrs = pd.Series(b_row.data, algo_lim.items_.ids(b_cols)) assert len(ub_nbrs) >= len(b_nbrs) assert len(b_nbrs) <= MODEL_SIZE @@ -341,6 +356,7 @@ def test_ii_large_models(rng, ml_ratings, ml_ds): @wantjit @mark.slow +@inference_mode def test_ii_implicit_large(rng, ml_ratings): "Test that implicit-feedback mode works on full test data." 
_log.info("training model") @@ -354,7 +370,7 @@ def test_ii_implicit_large(rng, ml_ratings): users = rng.choice(ml_ratings["user_id"].unique(), NUSERS) items: Vocabulary = algo.items_ - mat: torch.Tensor = algo.sim_matrix_.to_dense() + mat: NDArray[np.float32] = algo.sim_matrix_.toarray() for user in users: recs = pipe.run("recommender", query=user, n=NRECS) @@ -363,10 +379,10 @@ def test_ii_implicit_large(rng, ml_ratings): assert len(recs) == NRECS urates = ml_ratings[ml_ratings["user_id"] == user] - smat = mat[torch.from_numpy(items.numbers(urates["item_id"].values)), :] + smat = mat[items.numbers(urates["item_id"].values), :] for row in recs.to_df().itertuples(): col = smat[:, items.number(row.item_id)] - top, _is = torch.topk(col, NBRS) + top, _is = torch.topk(torch.from_numpy(col), NBRS) score = top.sum() try: assert row.score == approx(score) @@ -381,6 +397,7 @@ def test_ii_implicit_large(rng, ml_ratings): @wantjit +@inference_mode def test_ii_save_load(tmp_path, ml_ratings, ml_subset): "Save and load a model" original = ItemKNNScorer(30, save_nbrs=500) @@ -398,20 +415,20 @@ def test_ii_save_load(tmp_path, ml_ratings, ml_subset): algo = pickle.load(modf) _log.info("checking model") - assert all(np.logical_not(np.isnan(algo.sim_matrix_.values()))) - assert all(algo.sim_matrix_.values() > 0) + assert all(np.logical_not(np.isnan(algo.sim_matrix_.data))) + assert all(algo.sim_matrix_.data > 0) # a little tolerance - assert all(algo.sim_matrix_.values() < 1 + 1.0e-6) + assert all(algo.sim_matrix_.data < 1 + 1.0e-6) assert all(algo.item_counts_ == original.item_counts_) - assert algo.item_counts_.sum() == len(algo.sim_matrix_.values()) - assert len(algo.sim_matrix_.values()) == len(algo.sim_matrix_.values()) - assert all(algo.sim_matrix_.crow_indices() == original.sim_matrix_.crow_indices()) - assert algo.sim_matrix_.values() == approx(original.sim_matrix_.values()) + assert algo.item_counts_.sum() == len(algo.sim_matrix_.data) + assert len(algo.sim_matrix_.data) 
== len(algo.sim_matrix_.data) + assert all(algo.sim_matrix_.indptr == original.sim_matrix_.indptr) + assert algo.sim_matrix_.data == approx(original.sim_matrix_.data) r_mat = algo.sim_matrix_ o_mat = original.sim_matrix_ - assert all(r_mat.crow_indices() == o_mat.crow_indices()) + assert all(r_mat.indptr == o_mat.indptr) means = ml_ratings.groupby("item_id").rating.mean() assert means[algo.items_.ids()].values == approx(original.item_means_) @@ -441,8 +458,9 @@ def test_ii_known_preds(ml_ds): from lenskit import batch iknn = ItemKNNScorer(20, min_sim=1.0e-6) + pipe = topn_pipeline(iknn) _log.info("training %s on ml data", iknn) - iknn.train(ml_ds) + pipe.train(ml_ds) _log.info("model means: %s", iknn.item_means_) dir = Path(__file__).parent @@ -451,7 +469,7 @@ def test_ii_known_preds(ml_ds): known_preds = pd.read_csv(str(pred_file)) preds = { - user: iknn(user, ItemList(kps, prediction=False)) + user: score(pipe, query=user, items=ItemList(kps, prediction=False)) for (user, kps) in iter_item_lists(known_preds) } preds = dict_to_df(preds)