lenskit · mdekstrand · Jul 15, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 13, 2024
diff --git a/docs/conf.py b/docs/conf.py
@@ -81,6 +81,11 @@
 # how do we want to set up documentation?
 autodoc_default_options = {"members": True, "member-order": "bysource", "show-inheritance": True}
 autodoc_typehints = "description"
+autodoc_type_aliases = {
+    "Iterable": "Iterable",
+    "ArrayLike": "ArrayLike",
+}
+
 todo_include_todos = True
 
 # Cross-linking and external references
@@ -98,6 +103,11 @@
     "implicit": ("https://benfred.github.io/implicit/", None),
 }
 
+bibtex_bibfiles = ["lenskit.bib"]
+jupyter_execute_notebooks = "off"
+
+# -- external links
+
 extlinks = {
     "issue": ("https://github.com/lenskit/lkpy/issues/%s", "🐞 %s"),
     "pr": ("https://github.com/lenskit/lkpy/pull/%s", "⛙ %s"),

diff --git a/docs/data.rst b/docs/data.rst
@@ -1,9 +1,81 @@
-Data Utilities
---------------
+Data Management
+===============
 
 .. module:: lenskit.data
 
-These are general-purpose data processing utilities.
+LensKit provides a unified data model for recommender systems data along with
+classes and utility functions for working with it, described in this section of
+the manual.
+
+
+.. versionchanged:: 2024.1
+    The new :class:`Dataset` class replaces the Pandas data frames
+    that were passed to algorithms in the past.  It also subsumes
+    the old support for producing sparse matrices from rating frames.
+
+.. _data-model:
+
+Data Model and Key Concepts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LensKit data model consists of **users**, **items**, and **interactions**,
+with fields providing additional (optional) data about each of these entities.
+The simplest valid LensKit data set is simply a list of user and item
+identifiers indicating which items each user has interacted with.  These may be
+augmented with ratings, timestamps, or any other attributes.
+
+Data can be read from a range of sources, but ultimately resolves to a
+collection of tables (e.g. Pandas :class:`~pandas.DataFrame`) that record user,
+item, and interaction data.
+
+.. _data-identifiers:
+
+Identifiers
+-----------
+
+Users and items have two identifiers:
+
+* The *identifier* as presented in the original source table(s).  It appears in
+  LensKit data frames as ``user_id`` and ``item_id`` columns.  Identifiers can
+  be integers, strings, or byte arrays.
+* The *number* assigned by the dataset handling code.  This is a 0-based
+  contiguous user or item number that is suitable for indexing into arrays or
+  matrices, a common operation in recommendation models.  In data frames, this
+  appears as a ``user_num`` or ``item_num`` column.  It is the only
+  representation supported by NumPy and PyTorch array formats.
+
+  User and item numbers are assigned based on sorted identifiers in the initial
+  data source, so reloading the same data set will yield the same numbers.
+  Loading a subset, however, is not guaranteed to result in the same numbers, as
+  the subset may be missing some users or items.
+
+  Methods that add additional users or items will assign numbers based on the
+  sorted identifiers that do not yet have numbers.
+
+Identifiers and numbers can be mapped to each other with the user and item
+*vocabularies* (:attr:`~Dataset.user_vocab` and :attr:`~Dataset.item_vocab`), as
+well as convenience methods.
+
+.. autodata:: lenskit.data.vocab.EntityId
+
+.. _dataset:
+
+Dataset Abstraction
+~~~~~~~~~~~~~~~~~~~
+
+The LensKit :class:`Dataset` class is the standard LensKit interface to datasets
+for training, evaluation, etc. Trainable models and components expect a dataset
+instance to be passed to :meth:`~lenskit.algorithms.Recommender.fit`.
+
+.. autoclass:: Dataset
+
+User-Item Data Tables
+~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: lenskit.data.tables
+
+.. autoclass:: NumpyUserItemTable
+.. autoclass:: TorchUserItemTable
 
 Building Ratings Matrices
 ~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/docs/index.rst b/docs/index.rst
@@ -35,15 +35,16 @@ Resources
    releases/index
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Running Experiments
-
-   datasets
-   crossfold
-   batch
-   evaluation/index
-   documenting
-   parallel
+    :maxdepth: 2
+    :caption: Running Experiments
+
+    data
+    datasets
+    crossfold
+    batch
+    evaluation/index
+    documenting
+    parallel
 
 .. toctree::
     :maxdepth: 1

diff --git a/docs/util.rst b/docs/util.rst
@@ -4,7 +4,6 @@ Utility Functions
 These utility functions are useful for data processing.
 
 .. toctree::
-    data
     math
 
 Miscellaneous

diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py
@@ -6,6 +6,10 @@
 
 from typing import Literal, TypeAlias
 
-from .matrix import RatingMatrix, sparse_ratings  # noqa: F401
+from .vocab import EntityId, Vocabulary  # noqa: F401, E402
 
 FeedbackType: TypeAlias = Literal["explicit", "implicit"]
+"Types of feedback supported."
+
+from .dataset import Dataset, from_interactions_df  # noqa: F401, E402
+from .matrix import RatingMatrix, sparse_ratings  # noqa: F401, E402
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,7 +4,6 @@ Utility Functions @@
     These utility functions are useful for data processing.
     .. toctree::
-        data
         math
     Miscellaneous
@@ Expand Down @@