From 3e8132b00c99b57d62e8b487d4bdace7bef9a19c Mon Sep 17 00:00:00 2001
From: Samuel Stevens
Date: Fri, 25 Oct 2024 09:46:05 -0400
Subject: [PATCH] Updating docs

---
 docs/saev/config.html   | 141 +++++++++++++-----
 docs/saev/index.html    |  60 +++++---
 docs/saev/modeling.html | 320 +++++++++++++++------------------------
 docs/saev/sessions.html |  57 -------
 docs/saev/vits.html     |  57 -------
 saev/__init__.py        |   6 +
 saev/sessions.py        |   1 -
 saev/vits.py            |   1 -
 8 files changed, 260 insertions(+), 383 deletions(-)
 delete mode 100644 docs/saev/sessions.html
 delete mode 100644 docs/saev/vits.html
 delete mode 100644 saev/sessions.py
 delete mode 100644 saev/vits.py

diff --git a/docs/saev/config.html b/docs/saev/config.html
index 16ecaea..e731249 100644
--- a/docs/saev/config.html
+++ b/docs/saev/config.html
@@ -39,7 +39,7 @@

Classes

class Config
-(image_width: int = 224, image_height: int = 224, model: str = 'ViT-L-14/openai', module_name: str = 'resid', block_layer: int = -2, data: Huggingface | Webdataset = <factory>, n_workers: int = 8, d_in: int = 1024, n_epochs: int = 3, n_batches_in_store: int = 15, vit_batch_size: int = 1024, expansion_factor: int = 64, l1_coefficient: float = 8e-05, lr: float = 0.0004, lr_warm_up_steps: int = 500, batch_size: int = 1024, use_ghost_grads: bool = True, feature_sampling_window: int = 64, resample_batches: int = 32, feature_reinit_scale: float = 0.2, dead_feature_window: int = 64, dead_feature_estimation_method: str = 'no_fire', dead_feature_threshold: float = 1e-06, log_to_wandb: bool = True, wandb_project: str = 'saev', wandb_log_freq: int = 10, device: str = 'cuda', seed: int = 42, dtype: str = 'float32', checkpoint_path: str = 'checkpoints')
+(image_width: int = 224, image_height: int = 224, model: str = 'ViT-L-14/openai', module_name: str = 'resid', block_layer: int = -2, data: Imagenet | TreeOfLife = <factory>, n_workers: int = 8, d_vit: int = 1024, n_epochs: int = 3, n_batches_in_store: int = 15, vit_batch_size: int = 1024, expansion_factor: int = 64, l1_coefficient: float = 8e-05, lr: float = 0.0004, lr_warm_up_steps: int = 500, batch_size: int = 1024, use_ghost_grads: bool = True, feature_sampling_window: int = 64, resample_batches: int = 32, feature_reinit_scale: float = 0.2, dead_feature_window: int = 64, dead_feature_estimation_method: str = 'no_fire', dead_feature_threshold: float = 1e-06, log_to_wandb: bool = True, wandb_project: str = 'saev', wandb_log_freq: int = 10, device: str = 'cuda', seed: int = 42, dtype: str = 'float32', checkpoint_path: str = 'checkpoints', slurm: bool = False, slurm_acct: str = 'PAS2136', log_to: str = './logs')

Configuration for training a sparse autoencoder on a vision transformer.

@@ -58,14 +58,16 @@

Classes

 image_width: int = 224
 image_height: int = 224
 model: str = "ViT-L-14/openai"
+"""Model string, for use with open_clip."""
 module_name: str = "resid"
 block_layer: int = -2
-data: Huggingface | Webdataset = dataclasses.field(default_factory=Huggingface)
+data: Imagenet | TreeOfLife = dataclasses.field(default_factory=Imagenet)
+"""Which dataset to use."""
 n_workers: int = 8
 """Number of dataloader workers."""

 # SAE Parameters
-d_in: int = 1024
+d_vit: int = 1024

 # Activation Store Parameters
 n_epochs: int = 3
@@ -101,13 +103,19 @@

Classes

 dtype: str = "float32"
 checkpoint_path: str = "checkpoints"
+slurm: bool = False
+"""Whether to use submitit to run jobs on a slurm cluster."""
+slurm_acct: str = "PAS2136"
+"""Slurm account string."""
+log_to: str = "./logs"
+
 @property
 def store_size(self) -> int:
     return self.n_batches_in_store * self.batch_size

 @property
 def d_sae(self) -> int:
-    return self.d_in * self.expansion_factor
+    return self.d_vit * self.expansion_factor

 @property
 def run_name(self) -> str:
@@ -129,13 +137,13 @@
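As a quick orientation to the renamed and added fields, here is a hedged construction sketch; it assumes Config and Imagenet are importable from saev.config as the docs in this patch suggest, and the values are defaults or illustrative:

    from saev.config import Config, Imagenet  # assumed import path

    cfg = Config(
        data=Imagenet(),       # which dataset to use (formerly Huggingface | Webdataset)
        d_vit=1024,            # ViT activation dimension (renamed from d_in)
        slurm=True,            # run jobs on a slurm cluster via submitit
        slurm_acct="PAS2136",  # slurm account string
        log_to="./logs",
    )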

Class variables

-var d_in : int
+var d_vit : int

-var data : Huggingface | Webdataset
+var data : Imagenet | TreeOfLife
+
+    Which dataset to use.

var dead_feature_estimation_method : str
@@ -181,6 +189,10 @@

Class variables

+var log_to : str
+
var log_to_wandb : bool
@@ -195,7 +207,7 @@

Class variables

var model : str
+
+    Model string, for use with open_clip.

var module_name : str
@@ -221,6 +233,14 @@

Class variables

+var slurm : bool
+
+    Whether to use submitit to run jobs on a slurm cluster.
+
+var slurm_acct : str
+
+    Slurm account string.
+
var use_ghost_grads : bool
@@ -249,7 +269,7 @@

Instance variables

@property
 def d_sae(self) -> int:
-    return self.d_in * self.expansion_factor
+    return self.d_vit * self.expansion_factor
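(With the defaults above, d_sae = d_vit * expansion_factor = 1024 * 64 = 65,536 SAE features.)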
prop run_name : str
@@ -280,22 +300,23 @@

Instance variables

-class Huggingface
-(name: str = 'ILSVRC/imagenet-1k')
-
-    Configuration for datasets from HuggingFace.
+class Imagenet
+(name: str = 'ILSVRC/imagenet-1k')
+
+    Configuration for HuggingFace Imagenet.

Expand source code
@beartype.beartype
 @dataclasses.dataclass(frozen=True)
-class Huggingface:
-    """Configuration for datasets from HuggingFace."""
+class Imagenet:
+    """Configuration for HuggingFace Imagenet."""
 
     name: str = "ILSVRC/imagenet-1k"
+    """Dataset name. Probably don't want to change this."""
 
     @property
     def n_imgs(self) -> int:
@@ -308,14 +329,14 @@ 

Instance variables

Class variables

 var name : str
+
+    Dataset name. Probably don't want to change this.

Instance variables

 prop n_imgs : int
@@ -334,27 +355,27 @@

Instance variables

-class Webdataset
-(url: str = '/fs/ess/PAS2136/open_clip/data/evobio10m-v3.3/224x224/train/shard-{000000..000159}.tar', n_imgs: int = 9562377)
-
-    Configuration for webdataset (like TreeOfLife-10M).
+class TreeOfLife
+(metadata: str = 'treeoflife-10m.json', label_key: str = '.taxonomic_name.txt')
+
+    Configuration for the TreeOfLife-10M webdataset.

Webdatasets are designed for random sampling of the entire dataset so that over multiple epochs, every sample is seen, on average, the same number of times. However, for training sparse autoencoders, we need to calculate ViT activations exactly once for each example in the dataset. Webdatasets support this through the wids library.

Here is a short discussion of the steps required to use saev with webdatasets.

First, you will need to use widsindex (installed with the webdataset library) to create a metadata file used by wids. You can see an example file here. To generate my own metadata file, I ran this command:

-uv run widsindex create --name treeoflife-10m --output meta.json '/fs/ess/PAS2136/open_clip/data/evobio10m-v3.3/224x224/train/shard-{000000..000159}.tar'
+uv run widsindex create \
+    --name treeoflife-10m \
+    --output treeoflife-10m.json \
+    '/fs/ess/PAS2136/open_clip/data/evobio10m-v3.3/224x224/train/shard-{000000..000159}.tar'
 
-

It took a long time (more than an hour) and generated a meta.json file.

+

It took a long time (more than an hour, less than 3 hours) and generated a treeoflife-10m.json file.

Expand source code
@beartype.beartype
 @dataclasses.dataclass(frozen=True)
-class Webdataset:
+class TreeOfLife:
     """
-    Configuration for webdataset (like TreeOfLife-10M).
+    Configuration for the TreeOfLife-10M webdataset.
 
     Webdatasets are designed for random sampling of the entire dataset so that over multiple epochs, every sample is seen, on average, the same number of times. However, for training sparse autoencoders, we need to calculate ViT activations exactly once for each example in the dataset. Webdatasets support this through the [`wids`](https://github.com/webdataset/webdataset?tab=readme-ov-file#the-wids-library-for-indexed-webdatasets) library.
 
@@ -363,28 +384,64 @@ 

Instance variables

     First, you will need to use `widsindex` (installed with the webdataset library) to create a metadata file used by wids. You can see an example file [here](https://storage.googleapis.com/webdataset/fake-imagenet/imagenet-train.json). To generate my own metadata file, I ran this command:

     ```
-    uv run widsindex create --name treeoflife-10m --output meta.json '/fs/ess/PAS2136/open_clip/data/evobio10m-v3.3/224x224/train/shard-{000000..000159}.tar'
+    uv run widsindex create \
+        --name treeoflife-10m \
+        --output treeoflife-10m.json \
+        '/fs/ess/PAS2136/open_clip/data/evobio10m-v3.3/224x224/train/shard-{000000..000159}.tar'
     ```

-    It took a long time (more than an hour) and generated a `meta.json` file.
+    It took a long time (more than an hour, less than 3 hours) and generated a `treeoflife-10m.json` file.
     """

-    url: str = "/fs/ess/PAS2136/open_clip/data/evobio10m-v3.3/224x224/train/shard-{000000..000159}.tar"
+    metadata: str = "treeoflife-10m.json"
     """Path to dataset shards."""

-    n_imgs: int = 9562377
-    """Number of images in dataset."""
+    label_key: str = ".taxonomic_name.txt"
+    """Which key to use as the label."""
+
+    @property
+    def n_imgs(self) -> int:
+        with open(self.metadata) as fd:
+            metadata = json.load(fd)
+
+        return (
+            np.array([shard["nsamples"] for shard in metadata["shardlist"]])
+            .sum()
+            .item()
+        )

Class variables

-var n_imgs : int
-
-    Number of images in dataset.
+var label_key : str
+
+    Which key to use as the label.

-var url : str
+var metadata : str

     Path to dataset shards.

+Instance variables
+
+prop n_imgs : int
+
+Expand source code
+
+    @property
+    def n_imgs(self) -> int:
+        with open(self.metadata) as fd:
+            metadata = json.load(fd)
+
+        return (
+            np.array([shard["nsamples"] for shard in metadata["shardlist"]])
+            .sum()
+            .item()
+        )
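To make the new property concrete, here is a hedged sketch of the metadata shape that n_imgs reads: a top-level shardlist whose entries each carry an nsamples count. The shard names and counts are invented, and the saev.config import path is an assumption based on the docs above:

    import json

    from saev.config import TreeOfLife  # assumed import path

    # Invented two-shard metadata file in the shape n_imgs expects.
    metadata = {
        "shardlist": [
            {"url": "shard-000000.tar", "nsamples": 60000},
            {"url": "shard-000001.tar", "nsamples": 59750},
        ]
    }
    with open("treeoflife-10m.json", "w") as fd:
        json.dump(metadata, fd)

    cfg = TreeOfLife(metadata="treeoflife-10m.json")
    print(cfg.n_imgs)  # 60000 + 59750 = 119750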
diff --git a/docs/saev/index.html b/docs/saev/index.html
index 6613ecd..f8376f4 100644
--- a/docs/saev/index.html
+++ b/docs/saev/index.html
@@ -5,7 +5,7 @@
 saev API documentation
@@ -27,6 +27,8 @@

    Package saev

+saev is a Python package for training sparse autoencoders (SAEs) on vision transformers (ViTs) in PyTorch.
+
+The main entrypoint to the package is in main.py; use python main.py --help to see the options and documentation for the script.

    Sub-modules

    @@ -45,20 +47,13 @@

    Sub-modules

    saev.modeling
+
+    modeling is the main module for the saev package and contains all the important non-config classes.
+    It's fine for this package to be slow to import …

-saev.sessions

 saev.training

-saev.vits

    saev.webapp
    @@ -74,7 +69,7 @@

    Classes

class Config
-(image_width: int = 224, image_height: int = 224, model: str = 'ViT-L-14/openai', module_name: str = 'resid', block_layer: int = -2, data: Huggingface | Webdataset = <factory>, n_workers: int = 8, d_in: int = 1024, n_epochs: int = 3, n_batches_in_store: int = 15, vit_batch_size: int = 1024, expansion_factor: int = 64, l1_coefficient: float = 8e-05, lr: float = 0.0004, lr_warm_up_steps: int = 500, batch_size: int = 1024, use_ghost_grads: bool = True, feature_sampling_window: int = 64, resample_batches: int = 32, feature_reinit_scale: float = 0.2, dead_feature_window: int = 64, dead_feature_estimation_method: str = 'no_fire', dead_feature_threshold: float = 1e-06, log_to_wandb: bool = True, wandb_project: str = 'saev', wandb_log_freq: int = 10, device: str = 'cuda', seed: int = 42, dtype: str = 'float32', checkpoint_path: str = 'checkpoints')
+(image_width: int = 224, image_height: int = 224, model: str = 'ViT-L-14/openai', module_name: str = 'resid', block_layer: int = -2, data: Imagenet | TreeOfLife = <factory>, n_workers: int = 8, d_vit: int = 1024, n_epochs: int = 3, n_batches_in_store: int = 15, vit_batch_size: int = 1024, expansion_factor: int = 64, l1_coefficient: float = 8e-05, lr: float = 0.0004, lr_warm_up_steps: int = 500, batch_size: int = 1024, use_ghost_grads: bool = True, feature_sampling_window: int = 64, resample_batches: int = 32, feature_reinit_scale: float = 0.2, dead_feature_window: int = 64, dead_feature_estimation_method: str = 'no_fire', dead_feature_threshold: float = 1e-06, log_to_wandb: bool = True, wandb_project: str = 'saev', wandb_log_freq: int = 10, device: str = 'cuda', seed: int = 42, dtype: str = 'float32', checkpoint_path: str = 'checkpoints', slurm: bool = False, slurm_acct: str = 'PAS2136', log_to: str = './logs')

    Configuration for training a sparse autoencoder on a vision transformer.

    @@ -93,14 +88,16 @@

    Classes

 image_width: int = 224
 image_height: int = 224
 model: str = "ViT-L-14/openai"
+"""Model string, for use with open_clip."""
 module_name: str = "resid"
 block_layer: int = -2
-data: Huggingface | Webdataset = dataclasses.field(default_factory=Huggingface)
+data: Imagenet | TreeOfLife = dataclasses.field(default_factory=Imagenet)
+"""Which dataset to use."""
 n_workers: int = 8
 """Number of dataloader workers."""

 # SAE Parameters
-d_in: int = 1024
+d_vit: int = 1024

 # Activation Store Parameters
 n_epochs: int = 3
@@ -136,13 +133,19 @@

    Classes

 dtype: str = "float32"
 checkpoint_path: str = "checkpoints"
+slurm: bool = False
+"""Whether to use submitit to run jobs on a slurm cluster."""
+slurm_acct: str = "PAS2136"
+"""Slurm account string."""
+log_to: str = "./logs"
+
 @property
 def store_size(self) -> int:
     return self.n_batches_in_store * self.batch_size

 @property
 def d_sae(self) -> int:
-    return self.d_in * self.expansion_factor
+    return self.d_vit * self.expansion_factor

 @property
 def run_name(self) -> str:
@@ -164,13 +167,13 @@

    Class variables

-var d_in : int
+var d_vit : int

-var data : Huggingface | Webdataset
+var data : Imagenet | TreeOfLife
+
+    Which dataset to use.

    var dead_feature_estimation_method : str
    @@ -216,6 +219,10 @@

    Class variables

+var log_to : str
+
    var log_to_wandb : bool
    @@ -230,7 +237,7 @@

    Class variables

    var model : str
+
+    Model string, for use with open_clip.

    var module_name : str
    @@ -256,6 +263,14 @@

    Class variables

+var slurm : bool
+
+    Whether to use submitit to run jobs on a slurm cluster.
+
+var slurm_acct : str
+
+    Slurm account string.
+
    var use_ghost_grads : bool
    @@ -284,7 +299,7 @@

    Instance variables

    @property
     def d_sae(self) -> int:
    -    return self.d_in * self.expansion_factor
+    return self.d_vit * self.expansion_factor
    prop run_name : str
diff --git a/docs/saev/modeling.html b/docs/saev/modeling.html
index 9ed64b1..d22e82c 100644
--- a/docs/saev/modeling.html
+++ b/docs/saev/modeling.html
@@ -5,7 +5,8 @@
 saev.modeling API documentation
@@ -27,6 +28,8 @@

    Module saev.modeling

+modeling is the main module for the saev package and contains all the important non-config classes.
+It's fine for this package to be slow to import (see saev.config for a discussion of import times).

    @@ -35,8 +38,8 @@

    Module saev.modeling

    Functions

-def filter_no_caption_or_no_image(sample)
+def dump(filename: str, model_kwargs: dict[str, object], model: torch.nn.modules.module.Module)
    @@ -45,7 +48,14 @@

    Functions

    def get_acts_filepath(cfg: Config) ‑> str
+Return the activations filepath based on the relevant values of a config.
+
+Args
+    cfg
+        Config for experiment.
+
+Returns
+    Filepath to where activations should be dumped/loaded from.

def get_cache_dir() ‑> str
@@ -53,8 +63,8 @@

    Functions

    Get cache directory from environment variables, defaulting to the current working directory (.)

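The lookup order is not shown in this diff, so the snippet below is only a sketch of the described behavior; SAEV_CACHE and HF_HOME are assumed, illustrative variable names, not confirmed saev API:

    import os

    def get_cache_dir() -> str:
        # Hypothetical environment variables; the real names are not in this patch.
        for var in ("SAEV_CACHE", "HF_HOME"):
            value = os.environ.get(var)
            if value:
                return value
        # Default: the current working directory.
        return "."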
-def get_hf_dataloader(cfg: Config, preprocess) ‑> torch.utils.data.dataloader.DataLoader
+def get_imagenet_dataloader(cfg: Config, preprocess) ‑> torch.utils.data.dataloader.DataLoader
    @@ -63,19 +73,36 @@

    Functions

    def get_sae_batches(cfg: Config, acts_store: CachedActivationsStore) ‑> jaxtyping.Float[Tensor, 'store_size d_model']
-Get a batch of vit activations
+Get a batch of vit activations to re-initialize the SAE.
+
+Args
+    cfg
+        Config.
+    acts_store
+        Activation store.

-def get_wds_dataloader(cfg: Config, preprocess) ‑> torch.utils.data.dataloader.DataLoader
+def get_tol_dataloader(cfg: Config, preprocess) ‑> torch.utils.data.dataloader.DataLoader
+Get a dataloader for the TreeOfLife-10M dataset.
+
+Currently does not include a true index or label in the loaded examples.
+
+Args
+    cfg
+        Config.
+    preprocess
+        Image transform to be applied to each image.
+
+Returns
+    A PyTorch Dataloader that yields dictionaries with 'image' keys containing image batches.

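A hedged usage sketch; the import paths and the preprocess transform are assumptions, and the Config must point data at TreeOfLife for this loader to make sense:

    import torchvision.transforms as T

    from saev import config, modeling  # assumed import paths

    cfg = config.Config(data=config.TreeOfLife())
    preprocess = T.Compose([T.Resize((224, 224)), T.ToTensor()])

    dataloader = modeling.get_tol_dataloader(cfg, preprocess)
    for batch in dataloader:
        images = batch["image"]  # per the docs above, batches are dicts with an 'image' key
        break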
-def log_and_continue(exn)
-
-    Call in an exception handler to ignore any exception, issue a warning, and continue.
+def load(filename, cls: type[torch.nn.modules.module.Module]) ‑> torch.nn.modules.module.Module
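The new dump/load pair appears to replace the removed save_model/load_from_pretrained methods (see the SparseAutoencoder diff below). A hedged round-trip sketch matching only the signatures shown; the filename and kwargs are illustrative:

    from saev import modeling  # assumed import path

    kwargs = dict(d_vit=1024, d_sae=65536, l1_coeff=8e-5, use_ghost_grads=True)
    sae = modeling.SparseAutoencoder(**kwargs)

    # Persist the weights plus the kwargs needed to rebuild the module.
    modeling.dump("checkpoints/sae.pt", kwargs, sae)

    # Later: reconstruct an instance of the same class from disk.
    sae = modeling.load("checkpoints/sae.pt", modeling.SparseAutoencoder)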
def save_acts(cfg: Config, vit: RecordedVit)
@@ -136,7 +163,7 @@

    Classes

         else:
             raise ValueError(f"Invalid value '{on_missing}' for arg 'on_missing'.")

-        self.shape = (cfg.data.n_imgs, cfg.d_in)
+        self.shape = (cfg.data.n_imgs, cfg.d_vit)
         # TODO
         # self.labels = torch.tensor(dataset["label"])
         self.labels = None
@@ -353,16 +380,21 @@

    Methods

class Session
-(vit: RecordedVit, sae: SparseAutoencoder, acts_store: CachedActivationsStore)
+(cfg: Config, vit: RecordedVit, sae: SparseAutoencoder, acts_store: CachedActivationsStore)
-Session(vit, sae, acts_store)
+Session is a group of instances of the main classes for saev experiments.

    Expand source code
    @beartype.beartype
     class Session(typing.NamedTuple):
    +    """
    +    Session is a group of instances of the main classes for saev experiments.
    +    """
    +
    +    cfg: config.Config
         vit: RecordedVit
         sae: SparseAutoencoder
         acts_store: CachedActivationsStore
    @@ -378,7 +410,7 @@ 

    Methods

         sae = SparseAutoencoder(cfg)
         acts_store = CachedActivationsStore(cfg, vit, on_missing="error")
-        return cls(vit, sae, acts_store)
+        return cls(cfg, vit, sae, acts_store)

     @classmethod
     def from_disk(cls, path) -> "Session":
@@ -389,7 +421,7 @@

    Methods

         vit, _, acts_store = cls.from_cfg(cfg)
         sae = SparseAutoencoder.load_from_pretrained(path)
-        return cls(vit, sae, acts_store)
+        return cls(cfg, vit, sae, acts_store)
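A hedged sketch of building a Session from a config (names follow the code above; from_disk works analogously given a checkpoint path):

    from saev import config, modeling  # assumed import paths

    cfg = config.Config()
    session = modeling.Session.from_cfg(cfg)

    # NamedTuple fields, in order: cfg, vit, sae, acts_store.
    cfg, vit, sae, acts_store = session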

    Ancestors

      @@ -414,50 +446,28 @@

      Instance variables

 var acts_store : CachedActivationsStore

-    Alias for field number 2
+    Alias for field number 3

+var cfg : Config
+
+    Alias for field number 0

 var sae : SparseAutoencoder

-    Alias for field number 1
+    Alias for field number 2

 var vit : RecordedVit

-    Alias for field number 0
+    Alias for field number 1

class SparseAutoencoder
-(cfg: Config)
+(d_vit: int, d_sae: int, l1_coeff: float, use_ghost_grads: bool)
-    Base class for all neural network modules.
-
-    Your models should also subclass this class.
-
-    Modules can also contain other Modules, allowing to nest them in
-    a tree structure. You can assign the submodules as regular attributes::
-
-        import torch.nn as nn
-        import torch.nn.functional as F
-
-        class Model(nn.Module):
-            def __init__(self) -> None:
-                super().__init__()
-                self.conv1 = nn.Conv2d(1, 20, 5)
-                self.conv2 = nn.Conv2d(20, 20, 5)
-
-            def forward(self, x):
-                x = F.relu(self.conv1(x))
-                return F.relu(self.conv2(x))
-
-    Submodules assigned in this way will be registered, and will have their
-    parameters converted too when you call :meth:`to`, etc.
-
-    Note
-        As per the example above, an __init__() call to the parent class
-        must be made before assignment on the child.
-
-    :ivar training: Boolean represents whether this module is in training or
-        evaluation mode.
-    :vartype training: bool
+    Sparse auto-encoder (SAE) using L1 sparsity penalty.

    Initialize internal Module state, shared by both nn.Module and ScriptModule.

    @@ -465,58 +475,49 @@

    Instance variables

    @beartype.beartype
     class SparseAutoencoder(torch.nn.Module):
    -    def __init__(self, cfg: config.Config):
    +    """
    +    Sparse auto-encoder (SAE) using L1 sparsity penalty.
    +    """
    +
    +    l1_coeff: float
    +    use_ghost_grads: bool
    +
    +    def __init__(self, d_vit: int, d_sae: int, l1_coeff: float, use_ghost_grads: bool):
             super().__init__()
    -        if not isinstance(cfg.d_in, int):
    -            raise ValueError(
    -                f"d_in must be an int but was {cfg.d_in=}; {type(cfg.d_in)=}"
    -            )
     
    -        self.cfg = cfg
    -        self.l1_coefficient = cfg.l1_coefficient
    -        self.dtype = cfg.dtype
    -        self.device = cfg.device
    +        self.l1_coeff = l1_coeff
    +        self.use_ghost_grads = use_ghost_grads
     
    +        # Initialize the weights.
             # NOTE: if using resampling neurons method, you must ensure that we initialise the weights in the order W_enc, b_enc, W_dec, b_dec
             self.W_enc = torch.nn.Parameter(
    -            torch.nn.init.kaiming_uniform_(
    -                torch.empty(cfg.d_in, cfg.d_sae, dtype=self.dtype, device=self.device)
    -            )
    -        )
    -        self.b_enc = torch.nn.Parameter(
    -            torch.zeros(cfg.d_sae, dtype=self.dtype, device=self.device)
    +            torch.nn.init.kaiming_uniform_(torch.empty(d_vit, d_sae))
             )
    +        self.b_enc = torch.nn.Parameter(torch.zeros(d_sae))
     
             self.W_dec = torch.nn.Parameter(
    -            torch.nn.init.kaiming_uniform_(
    -                torch.empty(cfg.d_sae, cfg.d_in, dtype=self.dtype, device=self.device)
    -            )
    +            torch.nn.init.kaiming_uniform_(torch.empty(d_sae, d_vit))
             )
     
             with torch.no_grad():
                 # Anthropic normalizes this to have unit columns
                 self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
     
    -        self.b_dec = torch.nn.Parameter(
    -            torch.zeros(cfg.d_in, dtype=self.dtype, device=self.device)
    -        )
    +        self.b_dec = torch.nn.Parameter(torch.zeros(d_vit))
     
         @jaxtyped(typechecker=beartype.beartype)
         def forward(self, x: Float[Tensor, "batch d_model"], dead_neuron_mask=None):
    -        # move x to correct dtype
    -        x = x.to(self.dtype)
    -
             # Remove encoder bias as per Anthropic
             h_pre = (
                 einops.einsum(
    -                x - self.b_dec, self.W_enc, "... d_in, d_in d_sae -> ... d_sae"
    +                x - self.b_dec, self.W_enc, "... d_vit, d_vit d_sae -> ... d_sae"
                 )
                 + self.b_enc
             )
             f_x = torch.nn.functional.relu(h_pre)
     
             x_hat = (
    -            einops.einsum(f_x, self.W_dec, "... d_sae, d_sae d_in -> ... d_in")
    +            einops.einsum(f_x, self.W_dec, "... d_sae, d_sae d_vit -> ... d_vit")
                 + self.b_dec
             )
     
    @@ -525,9 +526,9 @@ 

    Instance variables

             torch.pow((x_hat - x.float()), 2)
             / (x**2).sum(dim=-1, keepdim=True).sqrt()
         )

-        mse_loss_ghost_resid = torch.tensor(0.0, dtype=self.dtype, device=self.device)
+        ghost_loss = torch.tensor(0.0, dtype=mse_loss.dtype, device=mse_loss.device)
         # gate on config and training so evals is not slowed down.
-        if self.cfg.use_ghost_grads and self.training and dead_neuron_mask.sum() > 0:
+        if self.use_ghost_grads and self.training and dead_neuron_mask.sum() > 0:
             assert dead_neuron_mask is not None

             # ghost protocol
@@ -541,42 +542,38 @@

    Instance variables

             ghost_out = feature_acts_dead_neurons_only @ self.W_dec[dead_neuron_mask, :]
             l2_norm_ghost_out = torch.norm(ghost_out, dim=-1)
             norm_scaling_factor = l2_norm_residual / (1e-6 + l2_norm_ghost_out * 2)
-            ghost_out = ghost_out * norm_scaling_factor[:, None].detach()
+            ghost_out *= norm_scaling_factor[:, None].detach()

             # 3.
-            mse_loss_ghost_resid = (
+            ghost_loss = (
                 torch.pow((ghost_out - residual.detach().float()), 2)
                 / (residual.detach() ** 2).sum(dim=-1, keepdim=True).sqrt()
             )
-            mse_rescaling_factor = (mse_loss / (mse_loss_ghost_resid + 1e-6)).detach()
-            mse_loss_ghost_resid = mse_rescaling_factor * mse_loss_ghost_resid
+            mse_rescaling_factor = (mse_loss / (ghost_loss + 1e-6)).detach()
+            ghost_loss *= mse_rescaling_factor

-        mse_loss_ghost_resid = mse_loss_ghost_resid.mean()
+        ghost_loss = ghost_loss.mean()
         mse_loss = mse_loss.mean()
         sparsity = torch.abs(f_x).sum(dim=1).mean(dim=(0,))
-        l1_loss = self.l1_coefficient * sparsity
-        loss = mse_loss + l1_loss + mse_loss_ghost_resid
+        l1_loss = self.l1_coeff * sparsity
+        loss = mse_loss + l1_loss + ghost_loss

-        return x_hat, f_x, loss, mse_loss, l1_loss, mse_loss_ghost_resid
+        return x_hat, f_x, loss, mse_loss, l1_loss, ghost_loss

     @torch.no_grad()
-    def initialize_b_dec(self, acts_store: CachedActivationsStore):
+    def initialize_b_dec(self, cfg: config.Config, acts_store: CachedActivationsStore):
         previous_b_dec = self.b_dec.clone().cpu()
-        assert isinstance(acts_store, CachedActivationsStore)
-        all_activations = get_sae_batches(self.cfg, acts_store).detach().cpu()
-        out = all_activations.mean(dim=0)
+        all_activations = get_sae_batches(cfg, acts_store).detach().cpu()
+        mean = all_activations.mean(dim=0)

         previous_distances = torch.norm(all_activations - previous_b_dec, dim=-1)
-        distances = torch.norm(all_activations - out, dim=-1)
+        distances = torch.norm(all_activations - mean, dim=-1)

-        print("Reinitializing b_dec with mean of activations")
-        print(
-            f"Previous distances: {previous_distances.median(0).values.mean().item()}"
-        )
-        print(f"New distances: {distances.median(0).values.mean().item()}")
+        print(f"Prev dist: {previous_distances.median(0).values.mean().item()}")
+        print(f"New dist: {distances.median(0).values.mean().item()}")

-        self.b_dec.data = out.to(self.dtype).to(self.device)
+        self.b_dec.data = mean.to(self.b_dec.dtype).to(self.b_dec.device)

     @torch.no_grad()
     def set_decoder_norm_to_unit_norm(self):
@@ -586,109 +583,34 @@

    Instance variables

     def remove_gradient_parallel_to_decoder_directions(self):
         """
         Update grads so that they remove the parallel component
-        (d_sae, d_in) shape
+        (d_sae, d_vit) shape
         """

         parallel_component = einops.einsum(
             self.W_dec.grad,
             self.W_dec.data,
-            "d_sae d_in, d_sae d_in -> d_sae",
+            "d_sae d_vit, d_sae d_vit -> d_sae",
         )

         self.W_dec.grad -= einops.einsum(
             parallel_component,
             self.W_dec.data,
-            "d_sae, d_sae d_in -> d_sae d_in",
-        )
-
-    def save_model(self, path: str):
-        """
-        Basic save function for the model. Saves the model's state_dict and the config used to train it.
-        """
-
-        # check if path exists
-        folder = os.path.dirname(path)
-        os.makedirs(folder, exist_ok=True)
-
-        state_dict = {"cfg": self.cfg, "state_dict": self.state_dict()}
-
-        if path.endswith(".pt"):
-            torch.save(state_dict, path)
-        elif path.endswith("pkl.gz"):
-            with gzip.open(path, "wb") as f:
-                pickle.dump(state_dict, f)
-        else:
-            raise ValueError(
-                f"Unexpected file extension: {path}, supported extensions are .pt and .pkl.gz"
-            )
-
-        print(f"Saved model to {path}")
-
-    @classmethod
-    def load_from_pretrained(cls, path: str):
-        """
-        Load function for the model. Loads the model's state_dict and the config used to train it.
-        This method can be called directly on the class, without needing an instance.
-        """
-
-        # Ensure the file exists
-        if not os.path.isfile(path):
-            raise FileNotFoundError(f"No file found at specified path: {path}")
-
-        # Load the state dictionary
-        if path.endswith(".pt"):
-            try:
-                state_dict = torch.load(path, weights_only=False)
-            except Exception as e:
-                raise IOError(f"Error loading the state dictionary from .pt file: {e}")
-        elif path.endswith(".pkl.gz"):
-            try:
-                with gzip.open(path, "rb") as f:
-                    state_dict = pickle.load(f)
-            except Exception as e:
-                raise IOError(
-                    f"Error loading the state dictionary from .pkl.gz file: {e}"
-                )
-        elif path.endswith(".pkl"):
-            try:
-                with open(path, "rb") as f:
-                    state_dict = pickle.load(f)
-            except Exception as e:
-                raise IOError(f"Error loading the state dictionary from .pkl file: {e}")
-        else:
-            raise ValueError(
-                f"Unexpected file extension: {path}, supported extensions are .pt, .pkl, and .pkl.gz"
-            )
-
-        # Ensure the loaded state contains both 'cfg' and 'state_dict'
-        if "cfg" not in state_dict or "state_dict" not in state_dict:
-            raise ValueError(
-                "The loaded state dictionary must contain 'cfg' and 'state_dict' keys"
-            )
-
-        # Create an instance of the class using the loaded configuration
-        instance = cls(cfg=state_dict["cfg"])
-        instance.load_state_dict(state_dict["state_dict"])
-
-        return instance
-
-    def get_name(self):
-        assert isinstance(self.cfg, config.Config)
-        return f"sparse_autoencoder_{self.cfg.model}_{self.cfg.block_layer}_{self.cfg.module_name}_{self.cfg.d_sae}"
+            "d_sae, d_sae d_vit -> d_sae d_vit",
+        )
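A hedged smoke-test sketch of the refactored module; the constructor arguments follow the new signature, and in eval mode self.training is False, so the ghost-grads branch short-circuits before dead_neuron_mask is touched:

    import torch

    from saev import modeling  # assumed import path

    sae = modeling.SparseAutoencoder(d_vit=1024, d_sae=65536, l1_coeff=8e-5, use_ghost_grads=True)
    sae.eval()  # skip the ghost-grads branch

    x = torch.randn(8, 1024)  # stand-in batch of ViT activations
    x_hat, f_x, loss, mse_loss, l1_loss, ghost_loss = sae(x)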

    Ancestors

    • torch.nn.modules.module.Module
-Static methods
-
-def load_from_pretrained(cls, path: str)
-
-    Load function for the model. Loads the model's state_dict and the config used to train it.
-    This method can be called directly on the class, without needing an instance.
+Class variables
+
+var l1_coeff : float
+
+var use_ghost_grads : bool

    Methods

    @@ -699,14 +621,8 @@

    Methods

-def get_name(self)
-
-def initialize_b_dec(self, acts_store: CachedActivationsStore)
+def initialize_b_dec(self, cfg: Config, acts_store: CachedActivationsStore)
    @@ -716,13 +632,7 @@

    Methods

    Update grads so that they remove the parallel component -(d_sae, d_in) shape

    -
    -
    -def save_model(self, path: str) -
    -
    -

    Basic save function for the model. Saves the model's state_dict and the config used to train it.

    +(d_sae, d_vit) shape

    def set_decoder_norm_to_unit_norm(self) @@ -747,13 +657,13 @@

    Methods

diff --git a/docs/saev/sessions.html b/docs/saev/sessions.html
deleted file mode 100644
index ff2beb3..0000000
--- a/docs/saev/sessions.html
+++ /dev/null
@@ -1,57 +0,0 @@
-saev.sessions API documentation
-Module saev.sessions
diff --git a/docs/saev/vits.html b/docs/saev/vits.html
deleted file mode 100644
index 6da39d6..0000000
--- a/docs/saev/vits.html
+++ /dev/null
@@ -1,57 +0,0 @@
-saev.vits API documentation
-Module saev.vits
diff --git a/saev/__init__.py b/saev/__init__.py
index 786c82d..a9bd478 100644
--- a/saev/__init__.py
+++ b/saev/__init__.py
@@ -1,3 +1,9 @@
+"""
+saev is a Python package for training sparse autoencoders (SAEs) on vision transformers (ViTs) in PyTorch.
+
+The main entrypoint to the package is in [main.py](https://github.com/samuelstevens/saev/blob/main/main.py); use `python main.py --help` to see the options and documentation for the script.
+"""
+
 from .config import Config

 __all__ = ["Config"]
diff --git a/saev/sessions.py b/saev/sessions.py
deleted file mode 100644
index 8b13789..0000000
--- a/saev/sessions.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/saev/vits.py b/saev/vits.py
deleted file mode 100644
index 8b13789..0000000
--- a/saev/vits.py
+++ /dev/null
@@ -1 +0,0 @@
-