diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 58a9ea49..4519cfd6 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -270,16 +270,20 @@ array_t wrap( return { device_id, context_handle, handle, dimensions }; } +/// Create a new (typed) CUDA array of the specified dimensions +///@{ +/// @param context ... in which the array is to be created template array_t create( const context_t& context, dimensions_t dimensions); +/// @param device ... in whose primary context the array is to be created template array_t create( const device_t& device, dimensions_t dimensions); - +///@} } // namespace array diff --git a/src/cuda/api/common_ptx_compilation_options.hpp b/src/cuda/api/common_ptx_compilation_options.hpp index 6a92a206..24a9b9e6 100644 --- a/src/cuda/api/common_ptx_compilation_options.hpp +++ b/src/cuda/api/common_ptx_compilation_options.hpp @@ -193,7 +193,7 @@ struct common_ptx_compilation_options_t { bool generate_relocatable_device_code { false }; // What about store caching? -}; +}; // common_ptx_compilation_options_t } // namespace rtc } // namespace cuda diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 1933d66e..ca97c5ea 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -210,6 +210,15 @@ inline context::flags_t get_flags(handle_t handle) } // namespace context +/** + * Waits for all previously-scheduled tasks on all streams (= queues) + * in a CUDA context to conclude, before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified context, the thread calling this method will either yield, + * spin or block until all tasks scheduled previously scheduled on streams + * within this context have concluded. + */ inline void synchronize(const context_t& context); /** @@ -745,10 +754,18 @@ inline handle_t create_and_push( /** * @brief creates a new context on a given device * - * @param device The device on which to create the new stream + * @param device + * The device which the new context will regard * @param sync_scheduling_policy + * Choice of how host threads are to perform synchronization with pending + * actions in streams within this context. See + * @ref host_thread_sync_scheduling_policy_t for a description of these + * choices. * @param keep_larger_local_mem_after_resize - * @return + * If true, larger allocations of global device memory, used by kernels + * requiring a larger amount of local memory, will be kept (so that future + * kernels with such requirements will not trigger a re-allocation). + * * @note Until CUDA 11, there used to also be a flag for enabling/disabling * the ability of mapping pinned host memory to device addresses. 
However, it was * being ignored since CUDA 3.2 already, with the minimum CUDA version supported @@ -861,6 +878,7 @@ inline context_t get_with_fallback_push() } // namespace current +/// @return true if the context is the primary context of its device bool is_primary(const context_t& context); namespace detail_ { diff --git a/src/cuda/api/detail/unique_span.hpp b/src/cuda/api/detail/unique_span.hpp index 5f18f4b2..7db1e752 100644 --- a/src/cuda/api/detail/unique_span.hpp +++ b/src/cuda/api/detail/unique_span.hpp @@ -37,7 +37,6 @@ namespace cuda { * * @tparam T the type of individual elements in the unique_span */ - template> class unique_span : public ::cuda::span { public: // span types diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index 4dfaf4ad..2fd91dd2 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -38,7 +38,7 @@ class pool_t; * @brief Waits for all previously-scheduled tasks on all streams (= queues) * on a specified device to conclude. * - * Depending on the host_thread_sync_scheduling_policy_t set for this + * Depending on the host_thread_sync_scheduling_policy_t set for the specified * device, the thread calling this method will either yield, spin or block * until all tasks scheduled previously scheduled on this device have been * concluded. @@ -604,11 +604,17 @@ class device_t { set_flags(other_flags | static_cast(new_policy)); } + /// @returns true if the device will keep larger amounts of global device memory allocated + /// for use as local memory, after a kernel was executed which required a larger-than-usual + /// allocation bool keeping_larger_local_mem_after_resize() const { return flags() & CU_CTX_LMEM_RESIZE_TO_MAX; } + /// @brief Instructs the (primary context of) the device to keep larger amounts of global + /// device memory allocated for use as local memory, after a kernel was executed which + /// required a larger-than-usual allocation void keep_larger_local_mem_after_resize(bool keep = true) { auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX; @@ -616,6 +622,9 @@ class device_t { set_flags(new_flags); } + /// @brief Instructs the (primary context of) the device to discard allocations of larger + /// amounts of global device memory which were used by a kernel requiring a larger amount + /// of local memory, and has concluded execution. void dont_keep_larger_local_mem_after_resize() { keep_larger_local_mem_after_resize(false); diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp index d3faaf07..9899afe4 100644 --- a/src/cuda/api/launch_config_builder.hpp +++ b/src/cuda/api/launch_config_builder.hpp @@ -179,6 +179,8 @@ class launch_config_builder_t { } } + /// Use the information specified for the builder to figure out the grid and block + /// dimensions with which the kernel is to be launched grid::composite_dimensions_t get_composite_dimensions() const noexcept(false) { auto result = get_unvalidated_composite_dimensions(); @@ -189,6 +191,10 @@ class launch_config_builder_t { } public: + /// Use the information specified to the builder (and defaults for the unspecified + /// information) to finalize the construction of a kernel launch configuration, + /// which can then be passed along with the kernel to a kernel-launching function, + /// e.g. 
the standalone @ref kernel::launch or the stream command @ref stream_t::enqueue_t::kernel_launch launch_configuration_t build() const { auto result = launch_configuration_t{ get_composite_dimensions() }; @@ -392,6 +398,7 @@ class launch_config_builder_t { } + /// Set the dimensions for each block in the intended kernel launch grid launch_config_builder_t& block_dimensions( grid::block_dimension_t x, grid::block_dimension_t y = 1, @@ -400,8 +407,17 @@ class launch_config_builder_t { return block_dimensions(grid::block_dimensions_t{x, y, z}); } + /// Set the block in the intended kernel launch grid to be uni-dimensional + /// with a specified size launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); } + /** + * Set the intended kernel launch grid to have 1D blocks, of the maximum + * length possible given the information specified to the builder. + * + * @note This will fail if neither a kernel nor a device have been chosen + * for the launch. + */ launch_config_builder_t& use_maximum_linear_block() { grid::block_dimension_t max_size; @@ -424,6 +440,16 @@ class launch_config_builder_t { } #if CUDA_VERSION >= 12000 + /** + * Set the dimensions of multi-block clusters within the grid. + * + * @note There is only a small number of possible dimension combinations of clusters; + * and this function does _not_ guarantee to fail immediately if you specify an + * invalid such combination. + * + * @note This setting does not affect the overall dimensions of the grid in terms of + * blocks. + */ launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims) { #ifndef NDEBUG @@ -434,6 +460,9 @@ class launch_config_builder_t { } #endif + /// Set the dimension of the grid for the intended kernel launch, in terms + /// of blocks + ///@{ launch_config_builder_t& grid_dimensions(grid::dimensions_t dims) { #ifndef NDEBUG @@ -447,6 +476,7 @@ class launch_config_builder_t { return *this; } + ///@} launch_config_builder_t& grid_dimensions( grid::dimension_t x, grid::dimension_t y = 1, @@ -455,9 +485,17 @@ class launch_config_builder_t { return grid_dimensions(grid::dimensions_t{x, y, z}); } + /// Set the grid for the intended launch to be one-dimensional, with a specified number + /// of blocks + ///@{ launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); } launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); } + ///@} + + /// Set the overall number of _threads_, in each dimension, of all blocks + /// in the grid of the intended kernel launch + ///@{ launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims) { #ifndef NDEBUG @@ -474,16 +512,30 @@ class launch_config_builder_t { { return overall_dimensions(grid::overall_dimensions_t{x, y, z}); } + ///@} + /// Set the intended launch grid to be linear, with a specified overall number of _threads_ + /// over all (1D) blocks in the grid launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); } + /** + * Set whether or blocks may synchronize with each other or not + * + * @note recall that even "non-cooperative" blocks can still access the same global memory + * locations, and can use atomic operations on such locations for (slow) synchronization. 
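To make the builder flow above concrete, here is a minimal usage sketch (not part of the patch; the device index, block size and thread count are placeholders, and it assumes the builder derives the grid dimensions when given both a block size and an overall size):

	auto device = cuda::device::get(0);
	auto config = cuda::launch_config_builder()
		.device(device)           // no kernel attached here; see .kernel() further below
		.block_size(256)          // one-dimensional blocks of 256 threads each
		.overall_size(1000000)    // total number of threads; the grid dimensions are derived
		.build();                 // yields a launch_configuration_t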
+	 */
	launch_config_builder_t& block_cooperation(bool cooperation)
	{
		thread_block_cooperation = cooperation;
		return *this;
	}

+	/// Allow kernel thread blocks to synchronize with each other, rather than guaranteeing
+	/// that they act independently (atomic global memory operations notwithstanding)
	launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); }
+
+	/// Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will
+	/// work entirely independently (atomic global memory operations notwithstanding)
	launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); }

	launch_config_builder_t& dynamic_shared_memory_size(
@@ -493,11 +545,18 @@ class launch_config_builder_t {
		return *this;
	}

+	/// Indicate that the intended launch should not allocate any shared
+	/// memory for the kernel to use beyond the static amount necessitated
+	/// by its (compiled) code.
	launch_config_builder_t& no_dynamic_shared_memory()
	{
		return dynamic_shared_memory_size(memory::shared::size_t(0));
	}

+	/// Indicate that the intended launch should allocate a certain amount of shared
+	/// memory for the kernel to use beyond the static amount necessitated
+	/// by its (compiled) code.
+	///@{
	launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
	{
#ifndef NDEBUG
@@ -512,13 +571,32 @@ class launch_config_builder_t {
	{
		return dynamic_shared_memory_size(size);
	}
+	///@}

+	/**
+	 * Indicate that the intended launch should allocate additional shared
+	 * memory for the kernel to use beyond the static amount necessitated
+	 * by its (compiled) code - with the amount to be determined based on
+	 * the block size.
+	 *
+	 * @param shared_mem_size_determiner a function determining the dynamic
+	 * shared memory size given the kernel launch block size
+	 */
	launch_config_builder_t& dynamic_shared_memory(
		kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
	{
		return dynamic_shared_memory_size(shared_mem_size_determiner);
	}

+	/**
+	 * Indicate that the specified wrapped kernel will be the one launched
+	 * with the configuration to be produced by this object. Such an indication
+	 * provides this object with information about the device and context in
+	 * which the kernel is to be launched, and ranges of possible values for
+	 * certain parameters (e.g. shared memory size, dimensions).
+	 *
+	 * @note Calling this method obviates a call to the @ref device() method.
+	 */
	launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
	{
		if (device_ and kernel_->device_id() != device_.value()) {
@@ -533,6 +611,15 @@ class launch_config_builder_t {
		return *this;
	}

+	/**
+	 * Indicate that the intended kernel launch would occur on (some stream in
+	 * some context on) the specified device. Such an indication provides this
+	 * object with some information regarding ranges of possible values for
+	 * certain parameters (e.g. shared memory size, dimensions).
+	 *
+	 * @note Do not call both this and the @ref kernel() method; prefer just that one.
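A sketch of the kernel-attached flow and the size-determiner overload documented above (illustrative only: `my_kernel_wrapper` stands for a cuda::kernel_t obtained elsewhere, e.g. from a module, and the determiner's signature is assumed to be "dynamic shared memory bytes as a function of the block size"):

	size_t shmem_for_block(int block_size) { return block_size * sizeof(float); }

	auto config = cuda::launch_config_builder()
		.kernel(&my_kernel_wrapper)              // also fixes the device and context for the launch
		.block_size(128)
		.num_blocks(80)
		.dynamic_shared_memory(shmem_for_block)  // evaluated for the block size actually used
		.build();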
+ */ + ///@{ launch_config_builder_t& device(const device::id_t device_id) { if (kernel_ and kernel_->device_id() != device_id) { @@ -548,7 +635,11 @@ class launch_config_builder_t { { return this->device(device.id()); } + ///@} + /// Clear the association with a specific kernel (which may have been + /// set using the @ref kernel method) + ///@{ launch_config_builder_t& kernel_independent() { kernel_ = nullptr; @@ -559,13 +650,14 @@ class launch_config_builder_t { kernel_ = nullptr; return *this; } + ///@} /** - * @brief THis will use information about the kernel, the already-set block size, + * @brief This will use information about the kernel, the already-set block size, * and the device to create a unidimensional grid of blocks to exactly saturate * the CUDA device's capacity for simultaneous active blocks. * - * @note This will _not_ set the block size - unlike + * @note This will _not_ set the block size - unlike {@ref min_params_for_max_occupancy()}. */ launch_config_builder_t& saturate_with_active_blocks() { @@ -584,6 +676,14 @@ class launch_config_builder_t { return *this; } + /** + * @brief This will use information about the kernel and the device to define + * a minimum launch grid which should guarantee maximum occupancy of the GPU's + * multiprocessors. + * + * @note A builder after this call _will_ set the block dimensions - unlike + * {@ref saturate_with_active_blocks()} . + */ launch_config_builder_t& min_params_for_max_occupancy() { if (not (kernel_)) { @@ -600,6 +700,7 @@ class launch_config_builder_t { } }; // launch_config_builder_t +/// A slightly shorter-named construction idiom for @ref launch_config_builder_t inline launch_config_builder_t launch_config_builder() { return {}; } } // namespace cuda diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp index f891f8e0..e213e7ed 100644 --- a/src/cuda/api/launch_configuration.hpp +++ b/src/cuda/api/launch_configuration.hpp @@ -58,7 +58,16 @@ enum class cluster_scheduling_policy_t { }; #endif +/** + * The full set of possible configuration parameters for launching a kernel on a GPU. + * + * @note Consider using a @ref launch_configuration_builder_t to incrementally construct + * these structs. + * + * @note this structure must be constructed with at least the grid and block dimensions. + */ struct launch_configuration_t { + /// Dimensions of the launch grid in blocks, and of the individual blocks in the grid. grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } }; /** @@ -160,6 +169,20 @@ struct launch_configuration_t { constexpr launch_configuration_t(const launch_configuration_t&) = default; constexpr launch_configuration_t(launch_configuration_t&&) = default; + /** + * Constructors corresponding to the CUDA runtime API's triple-chevron launch + * syntax: + * + * my_kernel <<< grid_Dims, block_dims, dynamic_shmem_size, my_stream >>> ( + * arg1, arg2, arg3, etc); + * + * ... where the specified aspects of the launch configuration are the dimensions + * and the dynamic shared memory size. + * + * @note The choices of stream and kernel function are _not_ part of the launch + * configuration. 
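The two occupancy-oriented builder methods above differ in what they decide for you; a brief sketch, again with a placeholder kernel wrapper:

	// Grid sized to saturate the device with active blocks; the block size must
	// already have been set, since saturate_with_active_blocks() does not change it:
	auto config_a = cuda::launch_config_builder()
		.kernel(&my_kernel_wrapper).block_size(256)
		.saturate_with_active_blocks().build();

	// Minimal grid for maximum occupancy; this variant chooses the block dimensions itself:
	auto config_b = cuda::launch_config_builder()
		.kernel(&my_kernel_wrapper)
		.min_params_for_max_occupancy().build();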
+ */ + ///@{ constexpr launch_configuration_t( grid::composite_dimensions_t grid_and_block_dimensions, memory::shared::size_t dynamic_shared_mem = 0u @@ -184,12 +207,14 @@ struct launch_configuration_t { grid::block_dimensions_t(block_dims), dynamic_shared_mem) { } + ///@} CPP14_CONSTEXPR launch_configuration_t& operator=(const launch_configuration_t& other) = default; CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) = default; }; #if __cplusplus < 202002L +///@cond constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept { return @@ -210,6 +235,7 @@ constexpr bool operator!=(const launch_configuration_t lhs, const launch_configu { return not (lhs == rhs); } +///@endcond #endif namespace detail_ { diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp index ef55571e..bd028477 100644 --- a/src/cuda/api/link.hpp +++ b/src/cuda/api/link.hpp @@ -20,14 +20,14 @@ namespace cuda { ///@cond class device_t; - class module_t; - class link_t; ///@endcond +/// Definitions related to CUDA linking-processes, captured by the @ref link_t wrapper class namespace link { +/// Kinds of images which can be used by the linker (some may require driver compilation work) enum class input_kind_t { cubin, /// Compiled device-class-specific device code ptx, /// PTX (microarchitecture-inspecific intermediate representation) @@ -36,9 +36,14 @@ enum class input_kind_t { library, /// An archive of objects files with embedded device code; a `.a` file }; +/// A raw CUDA driver handle for a linking-process using handle_t = CUlinkState; -// TODO: Check if the linking has been completed! +/** + * @brief Wrap an existing CUDA link-process in a @ref link_t wrapper class instance. + * + * @todo : Consider checking if the linking has already been completed! + */ inline link_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -48,21 +53,19 @@ inline link_t wrap( inline link_t create(const void *image, const link::options_t &options); -// TODO: Use a clase-class with C++17 of later, made up of the two classes here +/// Definitions relating to inputs to CUDA linking-processes namespace input { -/** - * A typed, named, image in memory which can be used as an input to a runtime - * CUDA linking process. - */ +/// A typed, named, image in memory which can be used as an input to a runtime CUDA linking-process struct image_t : memory::region_t { - const char *name; - link::input_kind_t type; + const char *name; /// Link images are attached a name when registered in a linking-process + link::input_kind_t type; /// type of contents found in the memory region }; +/// A typed, named, image in a file which can be used as an input to a runtime CUDA linking-process struct file_t { - const char *path; // TODO: Use a proper path in C++14 and later - link::input_kind_t type; + const char *path; + link::input_kind_t type; /// type of contents found in the file }; } // namespace input @@ -115,7 +118,16 @@ class link_t { return memory::region_t{cubin_output_start, cubin_output_size}; } - // TODO: Replace this with methods which take wrapper classes. + /** + * Add another linkable image, from memory, to this linking-process + * + * @param[in] image Memory region containing the image + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. 
+ */ void add(link::input::image_t image, const link::options_t &ptx_compilation_options = {}) const { auto marshalled_options = link::detail_::marshal(ptx_compilation_options); @@ -134,6 +146,17 @@ class link_t { + ::std::to_string(static_cast(image.type)) + " to a link."); } + /** + * Add another linkable image, from a file, to this linking-process + * + * @param[in] file_input Path of the image file to be added + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. + */ + ///@{ void add_file(link::input::file_t file_input, const link::options_t &options) const { auto marshalled_options = link::detail_::marshal(options); @@ -156,6 +179,7 @@ class link_t { return add_file(path.c_str(), file_contents_type); } #endif + ///@} protected: // constructors @@ -219,6 +243,7 @@ class link_t { namespace link { +/// Create a new link-process (before adding any compiled images or or image-files) inline link_t create(const link::options_t &options = link::options_t{}) { handle_t new_link_handle; @@ -241,13 +266,12 @@ inline link_t create(const link::options_t &options = link::options_t{}) do_take_ownership); } -// TODO: Check if the linking has been completed! inline link_t wrap( - device::id_t device_id, - context::handle_t context_handle, - link::handle_t handle, - const link::options_t &options, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + link::handle_t handle, + const link::options_t & options, + bool take_ownership) noexcept { return link_t{device_id, context_handle, handle, options, take_ownership}; } diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp index a2e17d59..52d98206 100644 --- a/src/cuda/api/link_options.hpp +++ b/src/cuda/api/link_options.hpp @@ -1,7 +1,8 @@ /** * @file * - * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + * @brief Definitions and utility functions relating to just-in-time compilation, assembly + * and linking of CUDA code. */ #pragma once #ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ @@ -21,17 +22,37 @@ class module_t; namespace link { +/// Possible strategies for obtaining fully-compiled binary code for a target device +/// when it is not immediately available. enum fallback_strategy_for_binary_code_t { + /// Prefer compiling available PTX code to produce fully-compiled binary code prefer_compiling_ptx = 0, + /// Prefer using existing fully-compiled (binary) code, for a compatible but + /// not identical target device prefer_using_compatible_binary = 1, }; namespace detail_ { +/// The CUDA driver's raw generic JIT-related option type using option_t = CUjit_option; +/** + * Mechanism for finalizing options into a format readily usable by the + * link_t wrapper (and by the `cuLink`- functions - but highly inconvenient + * for inspection and modification. + * + * @note Don't create these yourself unless you have to; use @ref options_t + * instead, and @ref options_t::marshal() when done, for completing the + * linking-process. If you must create them - use `push_back()` method + * repeatedly until done with all options. 
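Putting the linking-process pieces documented above together, a sketch (the memory region, registration name and file path are placeholders; default options are used, and retrieving the linked cubin via the completion method is not shown in this hunk):

	auto link = cuda::link::create();
	cuda::link::input::image_t ptx_input {
		ptx_region,                       // a memory::region_t holding PTX text
		"my_translation_unit",            // the name under which this input is registered
		cuda::link::input_kind_t::ptx };  // aggregate-initializing the base region needs C++17
	link.add(ptx_input);                  // the driver may compile the PTX at this point
	link.add_file({ "more_kernels.cubin", cuda::link::input_kind_t::cubin }, cuda::link::options_t{});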
+ */ struct marshalled_options_t { + /// The CUDA driver's expected type for number of link-related options using size_type = unsigned; + + /// The CUDA driver's enum for option identification has this many values - + /// and thus, there is need for no more than this many marshalled options constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; protected: @@ -39,8 +60,6 @@ struct marshalled_options_t { ::std::array value_buffer; size_type count_ { 0 }; public: - size_type count() const { return count_; } - void push_back(option_t option) { if (count_ >= max_num_options) { @@ -76,7 +95,12 @@ struct marshalled_options_t { } public: - + /** + * This method (alone) is used to populate this structure. + * + * @note The class is not a standard container, and this method cannot be + * reversed or undone, i.e. there is no `pop_back()` or `pop()`. + */ template void push_back(option_t option, T value) { @@ -85,25 +109,46 @@ struct marshalled_options_t { // Now set value_buffer[count-1]... value_buffer[count_-1] = process_value(value); } + + /// These three methods yield what the CUDA driver actually expects: + /// Two matching raw buffers and their count of elements + ///@{ const option_t* options() const { return option_buffer.data(); } const void * const * values() const { return value_buffer.data(); } + size_type count() const { return count_; } + ///@} }; } // namespace detail_ +/** + * A convenience class for holding, setting and inspecting options for a CUDA binary code + * linking process - which may also involve PTX compilation. + * + * @note This structure does not let you set those options which the CUDA driver documentation + * describes as having internal purposes only. + */ struct options_t final : public rtc::common_ptx_compilation_options_t { + /// options related to logging the link-process struct { + /// Non-error information regarding the logging process (i.e. its "standard output" stream) optional> info; + + /// Information regarding errors in the logging process (i.e. its "standard error" stream) optional> error; + + /// Control whether the info and error logging will be verbose bool verbose; } logs; - // Note: When this is true, the specific_target of the base class - // is overridden + /// Instead of using explicitly-specified binary target, from + /// @ref common_ptx_compilation_options_t::specific_target - use the device of the current CUDA + /// context as the target for binary generation bool obtain_target_from_cuda_context { true }; - /// fallback behavior if a (matching cubin???) is not found + /// Possible strategy for obtaining fully-compiled binary code when it is not + /// simply available in the input to the link-process optional fallback_strategy_for_binary_code; // Ignoring the "internal purposes only" options; @@ -118,6 +163,8 @@ struct options_t final : public rtc::common_ptx_compilation_options_t { namespace detail_ { +/// Construct a easily-driver-usable link-process options structure from +/// a more user-friendly `options_t` structure. inline marshalled_options_t marshal(const options_t& link_options) { marshalled_options_t marshalled{}; diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index ce72ecae..24f286a5 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -85,7 +85,7 @@ enum cpu_write_combining : bool { }; /** - * @brief options accepted by CUDA's allocator of memory with a host-side aspect + * options accepted by CUDA's allocator of memory with a host-side aspect * (host-only or managed memory). 
*/ struct allocation_options { @@ -106,6 +106,7 @@ inline unsigned make_cuda_host_alloc_flags(allocation_options options) /** * @namespace mapped + * * Memory regions appearing in both on the host-side and device-side address * spaces with the regions in both spaces mapped to each other (i.e. guaranteed * to have the same contents on access up to synchronization details). Consult the @@ -124,7 +125,7 @@ struct span_pair_t { }; /** - * @brief A pair of memory regions, one in system (=host) memory and one on a + * A pair of memory regions, one in system (=host) memory and one on a * CUDA device's memory - mapped to each other * * @note this is the mapped-pair equivalent of a `void *`; it is not a @@ -143,9 +144,7 @@ struct region_pair_t { } // namespace mapped -/** - * @brief CUDA-Device-global memory on a single device (not accessible from the host) - */ +///CUDA-Device-global memory on a single device (not accessible from the host) namespace device { namespace detail_ { @@ -325,7 +324,7 @@ struct deleter { /** - * @brief Sets consecutive elements of a region of memory to a fixed + * Sets consecutive elements of a region of memory to a fixed * value of some width * * @note A generalization of `set()`, for different-size units. @@ -339,7 +338,7 @@ template void typed_set(T* start, const T& value, size_t num_elements); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset for CUDA device-side memory * @@ -366,7 +365,7 @@ inline void set(region_t region, int byte_value) ///@} /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -389,7 +388,7 @@ inline void zero(region_t region) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a value of a certain type, in a CUDA device's * global memory @@ -558,7 +557,7 @@ inline void copy(region_t destination, void* source) ///@} /** - * @brief Sets a number of bytes in memory to a fixed value + * Sets a number of bytes in memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -571,7 +570,7 @@ inline void copy(region_t destination, void* source) void set(void* ptr, int byte_value, size_t num_bytes); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -586,7 +585,7 @@ inline void set(region_t region, int byte_value) } /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -610,7 +609,7 @@ inline void zero(void* ptr, size_t num_bytes) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a single element of a certain type, which may * be in host-side memory, global CUDA-device-side memory or CUDA-managed @@ -662,7 +661,7 @@ status_t multidim_copy(copy_parameters_t params) } // namespace detail_ /** - * @brief An almost-generalized-case memory copy, taking a rather complex structure of + * An almost-generalized-case memory copy, taking a rather complex structure of * copy parameters - wrapping the CUDA driver's own most-generalized-case copy * * @tparam 
NumDimensions The number of dimensions of the parameter structure. @@ -1216,8 +1215,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han /** - * @brief Sets consecutive elements of a region of memory to a fixed - * value of some width + * Sets consecutive elements of a region of memory to a fixed value of some width * * @note A generalization of `async::set()`, for different-size units. * @@ -1263,7 +1261,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& s void zero(void* start, size_t num_bytes, const stream_t& stream); /** - * @brief Asynchronously sets all bytes of a single pointed-to value + * Asynchronously sets all bytes of a single pointed-to value * to 0 (zero). * * @note asynchronous version of @ref memory::zero(T*) @@ -1440,6 +1438,7 @@ inline void copy( /** * @namespace host + * * Host-side (= system) memory which is "pinned", i.e. resides in * a fixed physical location - and allocated by the CUDA driver. */ @@ -1459,7 +1458,7 @@ inline region_t allocate( /** - * allocate pinned host memory + * Allocates pinned host memory * * @note "pinned" memory is allocated in contiguous physical ram * addresses, making it possible to copy to and from it to the the @@ -1469,25 +1468,17 @@ inline region_t allocate( * * @throws cuda::runtime_error if allocation fails for any reason * - * @param context * @param size_in_bytes the amount of memory to allocate, in bytes * @param options * options to pass to the cuda host-side memory allocator; see * {@ref memory::allocation_options}. * @return a pointer to the allocated stretch of memory */ -///@{ - -inline region_t allocate( - const context_t& context, - size_t size_in_bytes, - allocation_options options); - -region_t allocate( - size_t size_in_bytes, - allocation_options options); +region_t allocate(size_t size_in_bytes, allocation_options options); /** + * @copydoc allocate(size_t, allocation_options) + * * @param portability * whether or not the allocated region can be used in different * CUDA contexts. @@ -1504,16 +1495,18 @@ inline region_t allocate( return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } ); } +/// @copydoc allocate(size_t, portability_across_contexts, cpu_write_combining) inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) { return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} ); } -///@} - /** - * Free a region of pinned host memory which was allocated with one of the pinned host + * Frees a region of pinned host memory which was allocated with one of the pinned host * memory allocation functions. + * + * @note The address provided must be the _beginning_ of the region of allocated memory; + * and the entire region is freed (i.e. the region size is known to/determined by the driver) */ inline void free(void* host_ptr) { @@ -1526,6 +1519,11 @@ inline void free(void* host_ptr) throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr)); } +/** + * @copybrief free(void*) + * + * @param region The region of memory to free + */ inline void free(region_t region) { return free(region.data()); } namespace detail_ { @@ -1537,9 +1535,8 @@ struct deleter { void operator()(void* ptr) const { cuda::memory::host::free(ptr); } }; - /** - * @brief Makes a preallocated memory region behave as though it were allocated with @ref host::allocate. 
+ * Makes a pre-allocated memory region behave as though it were allocated with @ref host::allocate. * * Page-locks the memory range specified by ptr and size and maps it for the device(s) as specified by * flags. This memory range also is added to the same tracking mechanism as cuMemAllocHost() to @@ -1669,7 +1666,7 @@ inline void deregister(const_region_t region) } /** - * @brief Sets all bytes in a stretch of host-side memory to a single value + * Sets all bytes in a stretch of host-side memory to a single value * * @note a wrapper for @ref ::std::memset * @@ -1703,22 +1700,6 @@ inline void zero(T* ptr) } // namespace host -/** - * This type of memory, also known as _unified_ memory, appears within - * a unified, all-system address space - and is used with the same - * address range on the host and on all relevant CUDA devices on a - * system. It is paged, so that it may exceed the physical size of - * a CUDA device's global memory. The CUDA driver takes care of - * "swapping" pages "out" from a device to host memory or "swapping" - * them back "in", as well as of propagation of changes between - * devices and host-memory. - * - * @note For more details, see - * - * Unified Memory for CUDA Beginners on the - * Parallel4All blog. - * - */ namespace managed { namespace detail_ { @@ -1899,7 +1880,7 @@ inline region_t allocate( } // namespace detail_ /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices. * * @param context the initial context which is likely to access the managed @@ -1917,7 +1898,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices * * @param device the initial device which is likely to access the managed @@ -1935,7 +1916,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on all CUDA devices. * * @note While the allocated memory should be available universally, the @@ -2007,7 +1988,7 @@ inline void prefetch( } // namespace detail_ /** - * @brief Prefetches a region of managed memory to a specific device, so + * Prefetches a region of managed memory to a specific device, so * it can later be used there without waiting for I/O from the host or other * devices. */ @@ -2017,7 +1998,7 @@ void prefetch( const stream_t& stream); /** - * @brief Prefetches a region of managed memory into host memory. It can + * Prefetches a region of managed memory into host memory. It can * later be used there without waiting for I/O from any of the CUDA devices. */ void prefetch_to_host( @@ -2194,6 +2175,7 @@ inline bool is_part_of_a_region_pair(const void* ptr) namespace device { +/// A unique span of device-global memory template using unique_span = cuda::unique_span; @@ -2212,25 +2194,39 @@ unique_span make_unique_span(const context::handle_t context_handle, size_t s } // namespace detail_ /** - * @brief Create a variant of ::std::unique_pointer for an array in - * device-global memory. + * Allocate memory for a consecutive sequence of typed elements in device-global memory. 
* - * @note CUDA's runtime API always has a current device; but - - * there is not necessary a current context; so a primary context - * for a device may be created through this call. + * @tparam T type of the individual elements in the allocated sequence * - * @tparam T an array type; _not_ the type of individual elements + * @param context The CUDA device context in which to make the allocation. + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said * - * @param context The CUDA device context in which to make the - * allocation. - * @param num_elements the number of elements to allocate + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. * - * @return an ::std::unique_ptr pointing to the constructed T array -*/ + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user beware. + */ template unique_span make_unique_span(const context_t& context, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span(const device_t& device, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span(size_t size); @@ -2243,7 +2239,7 @@ inline device::unique_span make_unique_span(const context_t& context, size_t return device::make_unique_span(context, num_elements); } -/// See @ref `device::make_unique_span(const device_t& device, size_t num_elements)` +/// See @ref `device::make_unique_span(const context_t& context, size_t num_elements)` template inline device::unique_span make_unique_span(const device_t& device, size_t num_elements) { @@ -2252,9 +2248,29 @@ inline device::unique_span make_unique_span(const device_t& device, size_t nu namespace host { +/// A unique span of CUDA-driver-allocated, pinned host (=system) memory template using unique_span = cuda::unique_span; +/** + * Allocate memory for a consecutive sequence of typed elements in system + * (host-side) memory. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware, especially since this is host-side memory. 
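A short sketch of the typed-allocation helpers documented above (the element type, sizes and device index are placeholders):

	auto device    = cuda::device::get(0);
	auto on_host   = cuda::memory::host::make_unique_span<float>(1024);            // pinned host memory
	auto on_device = cuda::memory::device::make_unique_span<float>(device, 1024);  // device-global memory
	// ... fill on_host ...; both spans carry their sizes, but a raw-pointer copy is used here for brevity
	cuda::memory::copy(on_device.data(), on_host.data(), on_host.size() * sizeof(float));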
+ */ template unique_span make_unique_span(size_t size) { @@ -2265,6 +2281,7 @@ unique_span make_unique_span(size_t size) namespace managed { +/// A unique span of CUDA-driver-allocated managed memory template using unique_span = cuda::unique_span; @@ -2282,16 +2299,48 @@ unique_span make_unique_span( } // namespace detail_ +/** + * Allocate memory for a consecutive sequence of typed elements in system + * (host-side) memory. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware, especially since this is host-side memory. + */ template unique_span make_unique_span( const context_t& context, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span( const device_t& device, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span( size_t size, @@ -2302,6 +2351,7 @@ unique_span make_unique_span( } // namespace memory namespace symbol { + /** * Locates a CUDA symbol in global or constant device memory * diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index 6d04f547..1913ac0d 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -30,6 +30,13 @@ inline void initialize_driver() throw_if_error_lazy(status, "Failed initializing the CUDA driver"); } +/** + * A mechanism for ensuring a @ref cuInit() call has been made, to use before making + * any other driver API calls. + * + * @note differs from simply calling `initialize_driver()` in that repeated calls + * from the same thread will avoid additional @ref cuInit() call. + */ inline void ensure_driver_is_initialized() { thread_local bool driver_known_to_be_initialized{false}; diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp index f69315e4..b06d694c 100644 --- a/src/cuda/api/module.hpp +++ b/src/cuda/api/module.hpp @@ -30,10 +30,13 @@ class kernel_t; namespace module { +// The CUDA driver's raw handle for modules using handle_t = CUmodule; namespace detail_ { +/// Construct a module proxy object - for an existing module - from the class' +/// constituent fields inline module_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -70,6 +73,8 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i * be loaded (and in which the module contents may be used) * @param[in] module_data the opaque, raw binary data for the module - in a contiguous container * such as a span, a cuda::unique_span etc.. 
+ * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ ///@{ template = 201703L +/// @copydoc load_from_file(device_t, const char*) inline module_t load_from_file( const device_t& device, const ::std::filesystem::path& path) @@ -298,6 +337,7 @@ inline module_t load_from_file( return load_from_file(device, path.c_str()); } +/// @copydoc load_from_file(const char*) inline module_t load_from_file( const ::std::filesystem::path& path) { @@ -320,23 +360,19 @@ inline module_t wrap( return module_t{device_id, context_handle, module_handle, take_ownership, hold_pc_refcount_unit}; } -/* -template -module_t create(const context_t& context, const void* module_data, Creator creator_function); -*/ - /** * Creates a new module in a context using raw compiled code * * @param context The module will exist within this GPU context, i.e. the globals (functions, * variable) of the module would be usable within that constant. * @param module_data The raw compiled code for the module. - * @param link_options Potential options for the PTX compilation and device linking of the code. + * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ -///@{ module_t create(const context_t& context, const void* module_data, const link::options_t& link_options); + +/// @copydoc create(const context_t&, const void*, const link::options_t&) module_t create(const context_t& context, const void* module_data); -///@} inline void destroy(handle_t handle, context::handle_t context_handle, device::id_t device_id) { @@ -362,7 +398,17 @@ inline device::primary_context_t get_context_for(device_t& locus); } // namespace detail_ -// Note: The following may create the primary context of a device! +/** + * Create a new module - in a specified context or in a device's primary context, + * using raw module data in memory. + * + * @tparam Locus Either a @ref cuda::device_t or a {@ref cuda::context_t}. + * @tparam ContiguousContainer A span, a vector, a unique_span, or similar type + * @param locus Where the new module should be created + * @param module_data The raw data for the module in locus-accessible memory. + * + * @note This function may create/allocate resources for the primary context of a device! + */ template ::value, bool>> module_t create( @@ -373,6 +419,14 @@ module_t create( return detail_::create(context, module_data.data()); } +/** + * @copydoc create(Locus&&, ContiguousContainer) + * + * @param link_options Options for PTX compilation and for linking the module data, + * eventually. + * + * @return + */ // Note: The following may create the primary context of a device! template ::value, bool>> diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp index 34fb9c5f..3416def8 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_region.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -72,6 +72,14 @@ inline unique_region make_unique_region(size_t num_bytes) namespace managed { +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. 
+ * + * @param context A context of possible single-device-visibility + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, @@ -81,6 +89,14 @@ inline unique_region make_unique_region( return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; } +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. + * + * @param context A context of possible single-device-visibility + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, diff --git a/src/cuda/api/multi_wrapper_impls/unique_span.hpp b/src/cuda/api/multi_wrapper_impls/unique_span.hpp index c2907c63..0c0ee18b 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_span.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_span.hpp @@ -28,7 +28,7 @@ unique_span make_unique_span(const context_t& context, size_t num_elements) } /** - * @brief Create a variant of ::std::unique_pointer for an array in + * @brief Allocate (but do) * device-global memory * * @tparam T an array type; _not_ the type of individual elements diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp index 8c8e56ae..9518e941 100644 --- a/src/cuda/api/pci_id.hpp +++ b/src/cuda/api/pci_id.hpp @@ -53,6 +53,8 @@ struct pci_location_t { * and any of them can be used. */ static pci_location_t parse(const ::std::string& id_str); + + /// @copydoc parse(const ::std::string& id_str) static pci_location_t parse(const char* id_str); }; diff --git a/src/cuda/api/peer_to_peer.hpp b/src/cuda/api/peer_to_peer.hpp index 68c718b1..cc7c3f93 100644 --- a/src/cuda/api/peer_to_peer.hpp +++ b/src/cuda/api/peer_to_peer.hpp @@ -14,17 +14,31 @@ namespace cuda { namespace device { +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among peer CUDA GPU devices + * on the same system. 
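For example, the parse() overloads documented above might be used as follows (the bus-ID string is a placeholder; an invalid string is expected to result in an exception rather than a partially-filled structure):

	auto location = cuda::device::pci_location_t::parse("0000:3b:00.0");
	// the overload taking a ::std::string behaves identically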
+ */ namespace peer_to_peer { -// Aliases for all CUDA device attributes +/// Aliases for CUDA driver GPU attribute codes +///@{ + +/// A relative value indicating the performance of the link between two devices +constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; + +/// 1 if access is supported, 0 otherwise +constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; + +/// 1 if the first device can perform native atomic operations on the second device, 0 otherwise +constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; -constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; /// A relative value indicating the performance of the link between two devices -constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; /// 1 if access is supported, 0 otherwise -constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; /// 1 if the first device can perform native atomic operations on the second device, 0 otherwise #if CUDA_VERSION >= 10000 -constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array iterpolatory access operations are supported across the link, 0 otherwise +/// 1 if special array interpolatory access operations are supported across the link, 0 otherwise +constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; #endif - +///@} namespace detail_ { /** @@ -94,6 +108,12 @@ void disable_access_to(const context_t &peer_context); } // namespace current +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among "peer" contexts on + * the same system, which may possibly regard different CUDA devices. + */ namespace peer_to_peer { namespace detail_ { diff --git a/src/cuda/api/pointer.hpp b/src/cuda/api/pointer.hpp index 2e573b72..1688b8d3 100644 --- a/src/cuda/api/pointer.hpp +++ b/src/cuda/api/pointer.hpp @@ -120,7 +120,7 @@ inline memory::type_t type_of(const void* ptr) /// Obtain (a non-owning wrapper for) the CUDA context with which a memory address is associated /// (e.g. being the result of an allocation or mapping in that context) -inline context_t context_of(const void* ptr); +context_t context_of(void const* ptr); /** * A convenience wrapper around a raw pointer "known" to the CUDA runtime diff --git a/src/cuda/api/primary_context.hpp b/src/cuda/api/primary_context.hpp index db59acc5..4bbaff92 100644 --- a/src/cuda/api/primary_context.hpp +++ b/src/cuda/api/primary_context.hpp @@ -75,6 +75,12 @@ inline void increase_refcount(device::id_t device_id) } // namespace detail_ +/** + * @returns true if the device's primary context is active (i.e. has resources allocated for it), + * which implies we are holding a refcount unit for it somewhere. + * + * @note recall a primary context being active does not mean that it is the _current_ context + */ inline bool is_active(const device_t& device); /** @@ -140,6 +146,9 @@ class primary_context_t : public context_t { public: + /// @return a stream object for the default-ID stream of the device, which + /// is pre-created and on which actions are scheduled when the runtime API + /// is used and no stream is specified. 
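A sketch tying together the primary-context queries and the default stream documented above (the primary_context() accessor on device_t is assumed here; it is not part of this hunk):

	auto device = cuda::device::get(0);
	bool active = cuda::device::primary_context::is_active(device);  // resources allocated yet?
	auto pc     = device.primary_context();   // assumed accessor; activates the primary context if needed
	auto stream = pc.default_stream();        // the default-ID ("null") stream described above
	stream.synchronize();                     // wait for work scheduled on that default stream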
stream_t default_stream() const noexcept; public: // friendship @@ -280,6 +289,7 @@ inline bool is_current(device::id_t device_id) } // namespace detail +/// @return true if the current context is its device's primary context inline bool is_current() { auto device_id = context::current::detail_::get_device_id(); diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp index e9896a9a..ab57b314 100644 --- a/src/cuda/api/stream.hpp +++ b/src/cuda/api/stream.hpp @@ -50,6 +50,11 @@ enum : bool { nonblocking = async, }; +/** + * Kinds of conditions to apply to a value in GPU global memory + * when waiting on that value, i.e. on what condition to stop + * waiting. + */ enum wait_condition_t : unsigned { greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ, geq = CU_STREAM_WAIT_VALUE_GEQ, @@ -173,6 +178,9 @@ void enqueue_function_call(const stream_t& stream, Function function, void * arg * the current context and outlasting it. When set to `true`, * the proxy class will act as it does usually, destroying the stream * when being destructed itself. + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist * @return an instance of the stream proxy class, with the specified * device-stream combination. */ @@ -255,6 +263,8 @@ class stream_t { return flags & CU_STREAM_NON_BLOCKING; } + /// @returns the execution priority of a tasks on this stream (relative to other + /// tasks in other streams on the same device stream::priority_t priority() const { int the_priority; @@ -321,8 +331,21 @@ class stream_t { const stream_t& associated_stream; public: + ///@cond enqueue_t(const stream_t& stream) : associated_stream(stream) {} + ///@nocond + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param parameters to arguments to be passed to the kernel for this launch + * + * @note This function is cognizant of the types of all arguments passed to it; + * for a type-erased version, see @ref type_erased_kernel_launch() + */ template void kernel_launch( const KernelFunction& kernel_function, @@ -336,6 +359,19 @@ class stream_t { ::std::forward(parameters)...); } + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param marshalled_arguments Pointers to arguments to be passed to the kernel + * for this launch + * + * @note This signature does not require any type information regarding the kernel + * function type; see @ref kernel_launch() for a type-observing version of the + * same schedulign operation. + */ void type_erased_kernel_launch( const kernel_t& kernel, launch_configuration_t launch_configuration, @@ -345,21 +381,14 @@ class stream_t { } /** - * Have the CUDA device perform an I/O operation between two specified - * memory regions (on or off the actual device) + * Copy operations * + * The source and destination memory regions may be anywhere the CUDA driver can + * map (e.g. the device's global memory, host/system memory, the global memory of + * another device, constant memory etc.) */ - ///@{ - /** - * @param destination destination region into which to copy. 
May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param source destination region from which to copy. May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param num_bytes size of the region to copy - **/ + /// Schedule a copy of one region of memory to another void copy(void *destination, const void *source, size_t num_bytes) const { // CUDA doesn't seem to need us to be in the stream's context to enqueue the copy; @@ -368,6 +397,7 @@ class stream_t { memory::async::detail_::copy(destination, source, num_bytes, associated_stream.handle_); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source, size_t num_bytes) const { #ifndef NDEBUG @@ -378,16 +408,23 @@ class stream_t { copy(destination, source.start(), num_bytes); } + /** + * @copybrief copy(void *, const void *, size_t) const + * + * @note @p num_bytes may be smaller than the sizes of any of the regions + */ void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const { copy(destination.start(), source, num_bytes); } + /// @copybrief copy(void *, const void *, size_t) const void copy(memory::region_t destination, memory::const_region_t source) const { copy(destination, source, source.size()); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source) const { copy(destination, source, source.size()); @@ -398,9 +435,9 @@ class stream_t { * Set all bytes of a certain region in device memory (or unified memory, * but using the CUDA device to do it) to a single fixed value. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param byte_value the value with which to fill the memory region bytes - * @param num_bytes size of the region to fill + * @param num_bytes size in bytes of the region to fill */ void memset(void *start, int byte_value, size_t num_bytes) const { @@ -409,6 +446,7 @@ class stream_t { memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_); } + /// @copydoc memset(void *, int, size_t) const void memset(memory::region_t region, int byte_value) const { memset(region.data(), byte_value, region.size()); @@ -422,7 +460,7 @@ class stream_t { * API call for setting to zero; does that mean there are special facilities * for zero'ing memory faster? Who knows. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param num_bytes size of the region to fill */ void memzero(void *start, size_t num_bytes) const @@ -431,6 +469,9 @@ class stream_t { memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_); } + /** + * @copydoc memzero(void *, size_t) const + */ void memzero(memory::region_t region) const { memzero(region.data(), region.size()); @@ -492,6 +533,7 @@ class stream_t { } public: + /// Enqueues a host-invokable object, typically a function or closure object call. template void host_invokable(Invokable& invokable) const { @@ -529,13 +571,16 @@ class stream_t { memory::device::async::free(associated_stream, region); } #endif - ///@{ /** * Sets the attachment of a region of managed memory (i.e. 
in the address space visible * on all CUDA devices and the host) in one of several supported attachment modes. * - * The attachmentis actually a commitment vis-a-vis the CUDA driver and the GPU itself + * @param managed_region_start a pointer to the beginning of the managed memory region. + * This cannot be a pointer to anywhere in the middle of an allocated region - you must + * pass whatever @ref cuda::memory::managed::allocate() returned. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself * that it doesn't need to worry about accesses to this memory from devices other than * its object of attachment, so that the driver can optimize scheduling accordingly. * @@ -547,12 +592,6 @@ class stream_t { * the attachment goes into effect (some time after) previous scheduled actions have * concluded. */ - ///@{ - /** - * @param managed_region_start a pointer to the beginning of the managed memory region. - * This cannot be a pointer to anywhere in the middle of an allocated region - you must - * pass whatever @ref cuda::memory::managed::allocate() returned. - */ void attach_managed_region( const void* managed_region_start, memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const @@ -572,8 +611,23 @@ class stream_t { } /** - * @param region the managed memory region to attach; it cannot be a sub-region - - * you must pass whatever @ref cuda::memory::managed::allocate() returned. + * @copybrief attach_managed_region(const void*, memory::managed::attachment_t) const + * + * @param region the entire managed memory region; note this must not be a sub-region; + * you must pass whatever the CUDA memory allocation or construction code provided + * you with, in full. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself + * that it doesn't need to worry about accesses to this memory from devices other than + * its object of attachment, so that the driver can optimize scheduling accordingly. + * + * @note by default, the memory region is attached to this specific stream on its + * specific device. In this case, the host will be allowed to read from this memory + * region whenever no kernels are pending on this stream. + * + * @note Attachment happens asynchronously, as an operation on this stream, i.e. + * the attachment goes into effect (some time after) previous scheduled actions have + * concluded. */ void attach_managed_region( memory::region_t region, @@ -581,8 +635,6 @@ class stream_t { { attach_managed_region(region.start(), attachment); } - ///@} - /** * Will pause all further activity on the stream until the specified event has @@ -862,16 +914,21 @@ class stream_t { // it must release its refcount unit on destruction public: // data members - which only exist in lieu of namespaces + + /// This data member is a gadget for use instead of a "class-local" namespace; + /// we do not need it as a distinct object const enqueue_t enqueue { *this }; // The use of *this here is safe, since enqueue_t doesn't do anything with it // on its own. Any use of enqueue only happens through, well, *this - and // after construction. 
}; +///@cond inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept { return not (lhs == rhs); } +///@nocond namespace stream { @@ -972,16 +1029,7 @@ void enqueue_function_call(const stream_t& stream, Function function, void* argu * for execution scheduling; lower numbers represent higher properties; * each device has a range of priorities, which can be obtained using * @ref device_t::stream_priority_range() - * @param hold_pc_refcount_unit when the event's context is a device's primary - * context, this controls whether that context must be kept active while the - * event continues to exist - * @return The newly-created stream - */ -///@{ - -/** - * @brief Create a new stream (= queue) in the primary execution context - * of a CUDA device. + * @return The newly-created stream */ stream_t create( const device_t& device, @@ -992,6 +1040,16 @@ stream_t create( * @brief Create a new stream (= queue) in a CUDA execution context. * * @param context the execution context in which to create the stream + * @param synchronizes_with_default_stream if true, no work on this stream + * will execute concurrently with work from the default stream (stream 0) + * @param priority priority of tasks on the stream, relative to other streams, + * for execution scheduling; lower numbers represent higher priorities; + * each device has a range of priorities, which can be obtained using + * @ref device_t::stream_priority_range() + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist + * @return The newly-created stream */ stream_t create( const context_t& context, @@ -1002,7 +1060,16 @@ stream_t create( } // namespace stream -inline void synchronize(const stream_t& stream) +/** + * Waits for all previously-scheduled tasks on a given stream to conclude, + * before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified stream, the thread calling this method will either yield, + * spin or block until all tasks previously scheduled on the + * stream have concluded. 
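+ *
+ * A minimal usage sketch (`device`, the kernel, its arguments and the launch
+ * configuration are all assumed to already exist; the names are hypothetical):
+ * @code
+ * auto stream = cuda::stream::create(device, cuda::stream::async);
+ * stream.enqueue.kernel_launch(my_kernel, my_launch_config, arg1, arg2);
+ * cuda::synchronize(stream); // returns only once my_kernel has concluded
+ * @endcode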
+ */ + inline void synchronize(const stream_t& stream) { // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to, // and not have trouble acting on a stream in another context - it balks at doing so under diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp index c4456948..34a31a47 100644 --- a/src/cuda/api/texture_view.hpp +++ b/src/cuda/api/texture_view.hpp @@ -21,6 +21,7 @@ class texture_view; namespace texture { +/// The CUDA driver's raw, opaque handle for texture objects using raw_handle_t = CUtexObject; /** @@ -111,7 +112,6 @@ class texture_view { other.owning_ = false; }; - template texture_view( const cuda::array_t& arr, @@ -160,7 +160,10 @@ class texture_view { public: // non-mutating getters + /// @returns A non-owning proxy object for the CUDA context in which this texture is defined context_t context() const; + + /// @returns A non-owning proxy object for the CUDA device on which this texture is defined device_t device() const; public: // friendship @@ -174,7 +177,7 @@ class texture_view { bool owning_; }; - +///@cond inline bool operator==(const texture_view& lhs, const texture_view& rhs) noexcept { return lhs.raw_handle() == rhs.raw_handle(); @@ -184,7 +187,7 @@ inline bool operator!=(const texture_view& lhs, const texture_view& rhs) noexcep { return lhs.raw_handle() != rhs.raw_handle(); } - +///@nocond namespace texture { inline texture_view wrap( diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index a569015e..448f0ea7 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -2,7 +2,7 @@ * @file * * @brief Fundamental CUDA-related type definitions. - + * * This is a common file for all definitions of fundamental CUDA-related types, * some shared by different APIs. * @@ -64,16 +64,23 @@ namespace cuda { */ using status_t = CUresult; +/// A size type for use throughout the wrappers library (except when specific API functions +/// limit the size further) using size_t = ::std::size_t; -/** - * The index or number of dimensions of an entity (as opposed to the extent in any - * dimension) - typically just 0, 1, 2 or 3. - */ +/// The index or number of dimensions of an entity (as opposed to the extent in any +/// dimension) - typically just 0, 1, 2 or 3. using dimensionality_t = size_t; +/** + * @namespace array + * + * CUDA facilities for interpolating access to multidimensional array objects, + * in particular via the @ref array_t class. + */ namespace array { +/// An individual dimension extent for an array using dimension_t = size_t; /** @@ -87,12 +94,11 @@ using dimension_t = size_t; template struct dimensions_t; -/** - * Dimensions for 3D CUDA arrays - */ +/// Dimensions for 3D CUDA arrays template<> struct dimensions_t<3> // this almost-inherits cudaExtent { + /// The three constituent individual dimensions, named dimension_t width, height, depth; constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_, dimension_t depth_) @@ -117,25 +123,32 @@ struct dimensions_t<3> // this almost-inherits cudaExtent // 2. 
It doesn't do anything except construct the plain struct - as of CUDA 10 at least } + /// The total number of elements in a 3D entity with these dimensions constexpr __host__ __device__ size_t volume() const { return width * height * depth; } + + /// @copydoc volume constexpr __host__ __device__ size_t size() const { return volume(); } + + /// The number of non-trivial dimensions in this object (axes in which an object with + /// these dimensions is not "flat") constexpr __host__ __device__ dimensionality_t dimensionality() const { return ((width > 1) + (height> 1) + (depth > 1)); } - // Named constructor idioms - + /// Named constructor idiom: Dimensions for an equi-lateral cube static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; } + + ///Named constructor idiom: Dimensions for a one-element object: "Flat" in all individual + // dimensions static constexpr __host__ __device__ dimensions_t zero() { return cube(0); } }; -/** - * Dimensions for 2D CUDA arrays - */ +/// Dimensions for 2D CUDA arrays template<> struct dimensions_t<2> { + /// The two constituent individual dimensions, named; no "depth" for the 2D case. dimension_t width, height; constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_) @@ -159,8 +172,14 @@ struct dimensions_t<2> return *this; } + /// The total number of elements in a 2D entity with these dimensions constexpr __host__ __device__ size_t area() const { return width * height; } + + /// @copydoc area constexpr __host__ __device__ size_t size() const { return area(); } + + /// The number of non-trivial dimensions in this object (axes in which an object with + /// these dimensions is not "flat") constexpr __host__ __device__ dimensionality_t dimensionality() const { return ((width > 1) + (height> 1)); @@ -168,21 +187,33 @@ struct dimensions_t<2> // Named constructor idioms + /// Named constructor idiom: Dimensions for an equi-lateral cube static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x }; } + + ///Named constructor idiom: Dimensions for a one-element object: "Flat" in all individual + // dimensions static constexpr __host__ __device__ dimensions_t zero() { return square(0); } }; } // namespace array /** - * @brief Definitions and functionality related to CUDA events (not - * including the event wrapper type @ref event_t itself) + * @namespace event + * + * CUDA timing functionality, via events and their related code (not including + * the event wrapper type @ref event_t itself) */ namespace event { /// The CUDA driver's raw handle for events using handle_t = CUevent; +/** + * @namespace event::ipc + * + * Definitions and functionality related to CUDA events (not + * including the event wrapper type @ref event_t itself) + */ namespace ipc { /// The CUDA driver's raw handle for events passed between processes @@ -193,10 +224,11 @@ using handle_t = CUipcEventHandle; } // namespace event /** - * @brief Definitions and functionality related to CUDA streams (not + * @namespace stream + * + * Definitions and functionality related to CUDA streams (not * including the device wrapper type @ref stream_t itself) */ - namespace stream { /// The CUDA driver's raw handle for streams @@ -279,7 +311,9 @@ struct dimensions_t // this almost-inherits dim3 // as constexpr, so it isn't __host__ __device__ operator dim3(void) const { return { x, y, z }; } + /// The number of total elements in a 3D object with these dimensions constexpr __host__ __device__ size_t 
volume() const { return static_cast(x) * y * z; } + /// Number of dimensions in which this dimension structure is non-trivial, i.e. coordinates can /// have more than a single value constexpr __host__ __device__ dimensionality_t dimensionality() const @@ -289,11 +323,20 @@ struct dimensions_t // this almost-inherits dim3 // Named constructor idioms + /// Dimensions of an equi-lateral 3D cube static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; } + + /// Dimensions of an equi-lateral 2D square, with a trivial third dimension static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x, 1 }; } + + /// Dimensions of a 1D line, with the last two dimensions trivial static constexpr __host__ __device__ dimensions_t line(dimension_t x) { return dimensions_t{ x, 1, 1 }; } + + /// Dimensions of a single point - trivial in all axes static constexpr __host__ __device__ dimensions_t point() { return dimensions_t{ 1, 1, 1 }; } + /// @returns true if the dimensions on the left-hand side divide, elementwise, those + /// on the right-hand side static bool divides(dimensions_t lhs, dimensions_t rhs) { return @@ -378,6 +421,7 @@ struct overall_dimensions_t } }; +///@cond constexpr bool operator==(overall_dimensions_t lhs, overall_dimensions_t rhs) noexcept { return (lhs.x == rhs.x) and (lhs.y == rhs.y) and (lhs.z == rhs.z); @@ -396,6 +440,7 @@ constexpr overall_dimensions_t operator*(dimensions_t grid_dims, block_dimension grid_dims.z * overall_dimension_t { block_dims.z }, }; } +///@nocond /** * Composite dimensions for a grid - in terms of blocks, then also down @@ -405,18 +450,24 @@ struct composite_dimensions_t { grid::dimensions_t grid; grid::block_dimensions_t block; + /// @returns The overall dimensions of the entire grid as a single 3D entity constexpr overall_dimensions_t flatten() const { return grid * block; } + /// @returns The total number of threads over all blocks of the grid constexpr size_t volume() const { return flatten().volume(); } + /// @returns the number of axes in which the grid overall has non-trivial dimension constexpr size_t dimensionality() const { return flatten().dimensionality(); } + /// A named constructor idiom for the composite dimensions of a single-block grid + /// with a single-thread block static constexpr composite_dimensions_t point() { return { dimensions_t::point(), block_dimensions_t::point() }; } }; +///@cond constexpr bool operator==(composite_dimensions_t lhs, composite_dimensions_t rhs) noexcept { return (lhs.grid == rhs.grid) and (lhs.block == rhs.block); @@ -426,6 +477,7 @@ constexpr bool operator!=(composite_dimensions_t lhs, composite_dimensions_t rhs { return not (lhs == rhs); } +///@nocond } // namespace grid @@ -438,6 +490,7 @@ constexpr bool operator!=(composite_dimensions_t lhs, composite_dimensions_t rhs namespace memory { #if CUDA_VERSION >= 10020 +/// Named boolean constants used in memory read and write access control enum : bool { read_enabled = true, read_disabled = false, @@ -445,10 +498,19 @@ enum : bool { write_disabled = false }; +/** + * An access permission specification structure for physical allocations + * performed by / registered with the CUDA driver + * + * @note defined for ease of use instead of the raw CUDA driver's @ref CUmemAccess_flags + * type. 
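+ *
+ * A brief usage sketch:
+ * @code
+ * auto permissions = cuda::memory::access_permissions_t::read_and_write();
+ * permissions.write = false; // now read-only
+ * @endcode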
+ */ struct access_permissions_t { bool read : 1; bool write : 1; + /// This allows passing this access permissions structure to CUDA driver memory + /// access control API functions operator CUmemAccess_flags() const noexcept { return read ? @@ -456,6 +518,7 @@ struct access_permissions_t { CU_MEM_ACCESS_FLAGS_PROT_NONE; } + /// A named constructor idiom for access permissions static access_permissions_t from_access_flags(CUmemAccess_flags access_flags) { access_permissions_t result; @@ -464,6 +527,7 @@ struct access_permissions_t { return result; } + /// A named constructor idiom for allowing both reading and writing static constexpr access_permissions_t read_and_write() { return access_permissions_t{ read_enabled, write_enabled }; @@ -472,7 +536,8 @@ struct access_permissions_t { namespace physical_allocation { -// TODO: Consider simply aliasing CUmemAllocationHandleType and using constexpr const's or anonymous enums +/// The different kinds of memory handles the CUDA driver recognizes, and can possibly +/// utilize enum class shared_handle_kind_t : ::std::underlying_type::type { #if CUDA_VERSION >= 11020 no_export = CU_MEM_HANDLE_TYPE_NONE, @@ -494,6 +559,7 @@ template <> struct shared_handle_type_helper } // namespace detail_ +/// The raw handle for different kinds of shared memory the CUDA driver recognizes template using shared_handle_t = typename detail_::shared_handle_type_helper::type; @@ -501,12 +567,22 @@ using shared_handle_t = typename detail_::shared_handle_type_helper= 10020 #if CUDA_VERSION >= 11020 -namespace pool { /** - * @note Unsupported for now + * @namespace pool + * + * pool-based allocation functionality */ +namespace pool { + +/// Raw CUDA driver handle for a memory pool; avoid using these and prefer @ref memory::pool_t}. using handle_t = CUmemoryPool; + +/// The different kinds of memory handles the CUDA driver recognizes, and can +/// possibly be used with pool allocation using shared_handle_kind_t = physical_allocation::shared_handle_kind_t; + +/// The raw handle for different kinds of shared memory the CUDA driver recognizes, for +/// possible use with memory pools using physical_allocation::shared_handle_t; } // namespace pool @@ -514,23 +590,26 @@ using physical_allocation::shared_handle_t; namespace pointer { +/// Raw CUDA driver choice type for attributes of pointers using attribute_t = CUpointer_attribute; } // namespace pointer namespace device { -/** - * The numeric type which can represent the range of memory addresses on a CUDA device. - */ +/// The numeric type which can represent the range of memory addresses on a CUDA device. +/// As these addresses are typically just part of the single, unified all-system memory +/// space, this should be the same type as a system memory address' numeric equivalent. + using address_t = CUdeviceptr; static_assert(sizeof(void *) == sizeof(device::address_t), "Unexpected address size"); /** - * Return a pointers address as a numeric value of the type appropriate for device - * @param device_ptr a pointer into device memory - * @return a reinterpretation of @p device_address as a numeric address. + * @returns a cast of a proper pointer into a numeric address in device memory space + * (which is usually just a part of the unified all-system memory space) + * + * @note Typically, this is just a reinterpretation of the same value. 
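+ *
+ * A brief usage sketch (`ptr` is assumed to point into device-accessible memory):
+ * @code
+ * cuda::memory::device::address_t numeric = cuda::memory::device::address(ptr);
+ * assert(cuda::memory::as_pointer(numeric) == ptr);
+ * @endcode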
*/ inline address_t address(const void* device_ptr) noexcept { @@ -538,16 +617,36 @@ inline address_t address(const void* device_ptr) noexcept return reinterpret_cast(device_ptr); } +/** + * @returns The numeric address of the beginning of a memory region + * + * @note Typically, this is just a reinterpretation of the same value. + */ inline address_t address(memory::const_region_t region) noexcept { return address(region.start()); } } // namespace device +/// @returns a cast of a numeric address in device memory space (which, in recent CUDA +/// versions, is just a part of the unified all-system memory space) into a proper +/// pointer. inline void* as_pointer(device::address_t address) noexcept { static_assert(sizeof(void*) == sizeof(device::address_t), "Incompatible sizes for a void pointer and memory::device::address_t"); return reinterpret_cast(address); } +/** + * @namespace shared + * + * A memory space whose contents is shared by all threads in a CUDA kernel block, + * but specific to each kernel block separately. Shared memory is stored in a special + * area in each of a GPU's individual physical cores (symmetric multiprocessors, or + * SM's, in NVIDIA parlance) - which is also the area used for L1 cache. One may + * therefore think of it as L1-cache-like memory which holds arbitrary data rather + * than a cached copy of any global memory locations. It is only usable in + * device-side (= kernel) code, but control and inspection of its size is part of + * the CUDA API functionality. + */ namespace shared { /** @@ -566,19 +665,42 @@ using size_t = unsigned; } // namespace shared +/** + * @namespace managed + * + * Paged memory accessible in both device-side and host-side code by triggering transfers + * of pages between physical system memory and physical device memory. + * + * This type of memory, also known as _unified_ memory, appears within a unified, all-system + * address space - and is used with the same address range on the host and on all relevant + * CUDA devices on a system. It is paged, so that it may exceed the physical size of a CUDA + * device's global memory. The CUDA driver takes care of "swapping" pages "out" from a device + * to host memory or "swapping" them back "in", as well as of propagation of changes between + * devices and host-memory. + * + * @note For more details, see + * + * Unified Memory for CUDA Beginners on the + * Parallel4All blog. 
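+ *
+ * A minimal allocation sketch (the region size here is arbitrary):
+ * @code
+ * auto region = cuda::memory::managed::make_unique_region(1024 * sizeof(float));
+ * auto* data = static_cast<float*>(region.data());
+ * data[0] = 1.0f; // host-side access; the same pointer is also usable in kernel code
+ * @endcode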
+ */ namespace managed { +/// The choice of which category of CUDA devices a managed memory region must be visible to enum class initial_visibility_t { to_all_devices, to_supporters_of_concurrent_managed_access, }; +/// A specifier of one of the attributes of managed memory regions using range_attribute_t = CUmem_range_attribute; } // namespace managed #if CUDA_VERSION >= 11070 +/// Memory barriers can apply to different scopes of scheduled work which must reach the barrier before +/// continuing enum class barrier_scope_t : typename ::std::underlying_type::type { + /// All work on device = CU_STREAM_MEMORY_BARRIER_TYPE_GPU, system = CU_STREAM_MEMORY_BARRIER_TYPE_SYS }; @@ -586,10 +708,13 @@ enum class barrier_scope_t : typename ::std::underlying_type= 10000 /** + * @namespace memory::external + * * Representation of memory resources external to CUDA */ namespace external { +/// Raw CUDA driver handle for an external memory resource represented for the driver using handle_t = CUexternalMemory; /** @@ -692,15 +817,13 @@ using attribute_t = CUdevice_P2PAttribute; namespace context { +/// Raw CUDA driver handle for a context; see {@ref context_t}. using handle_t = CUcontext; using flags_t = unsigned; -/** - * Scheduling policies the Runtime API may use when the host-side - * thread it is running in needs to wait for results from a certain - * device - */ +/// Scheduling policies the CUDA driver may use when the host-side thread it is +/// running in needs to wait for results from a certain device or context. enum host_thread_sync_scheduling_policy_t : unsigned int { /** @@ -762,10 +885,12 @@ using flags_t = context::flags_t; namespace primary_context { +/// Raw CUDA driver handle for a device's primary context using handle_t = cuda::context::handle_t; } // namespace primary_context +/// @copydoc context::host_thread_sync_scheduling_policy_t using host_thread_sync_scheduling_policy_t = context::host_thread_sync_scheduling_policy_t; } // namespace device @@ -785,21 +910,27 @@ inline T identity_cast(U&& x) } // namespace detail_ +/// The CUDA-driver-specific representation of a UUID value; see also {@ref device_t::uuid()} using uuid_t = CUuuid; namespace module { +/// Raw CUDA driver handle of a module of compiled code; see @ref module_t using handle_t = CUmodule; } // namespace module namespace kernel { +/// Raw CUDA driver selector of a kernel attribute using attribute_t = CUfunction_attribute; + +/// The uniform type the CUDA driver uses for all kernel attributes; it is typically more +/// appropriate to use @ref cuda::kernel_t methods, which also employ more specific, +/// appropriate types. using attribute_value_t = int; -// TODO: Is this really only for kernels, or can any device-side function be -// represented by a CUfunction? +/// A raw CUDA driver handle for a kernel; prefer using the @ref cuda::kernel_t type. 
using handle_t = CUfunction; } // namespace kernel diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp index d054217f..a9b65a99 100644 --- a/src/cuda/api/unique_region.hpp +++ b/src/cuda/api/unique_region.hpp @@ -151,6 +151,7 @@ class unique_region : public region_t { namespace device { +/// A unique region of device-global memory using unique_region = memory::unique_region; namespace detail_ { @@ -164,20 +165,30 @@ inline unique_region make_unique_region(const context::handle_t context_handle, } // namespace detail_ /** - * @brief Allocate an array in device-global memory and return an owning class for it + * @brief Allocate a region in device-global memory * - * @param num_bytes the size in bytes of the allocated region - */ -///@{ -/** - * @param device The CUDA device in whose global memory to make the allocation. + * @param context The context within which (and in the device global memory + * of which) to make the allocation + * @param num_bytes Size of the region to be allocated, in bytes + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const context_t& context, size_t num_bytes); + /** - * @param context The CUDA context in which to make the allocation. + * @brief Allocate a region in device-global memory + * + * @param device The device in the global memory of which to make the allocation + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a region in device-global memory within the primary context + * of the current CUDA device + * + * @param device The device in the global memory of which to make the allocation + * @returns An owning RAII/CADRe object for the allocated memory region + */ unique_region make_unique_region(size_t num_bytes); ///}@ @@ -198,19 +209,21 @@ inline device::unique_region make_unique_region(const device_t& device, size_t n namespace host { +/// A unique region of pinned host memory using unique_region = memory::unique_region; -inline unique_region make_unique_region( - const context_t& context, - size_t num_bytes, - allocation_options options = allocation_options{}); -inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a physical-address-pinned region of system memory + * + * @returns An owning RAII/CADRe object for the allocated memory region + */ inline unique_region make_unique_region(size_t num_bytes); } // namespace host namespace managed { +/// A unique region of managed memory, see @ref cuda::memory::managed using unique_region = memory::unique_region; namespace detail_ { @@ -226,14 +239,34 @@ inline unique_region make_unique_region( } // namespace detail_ +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param context A context, to set when allocating the memory region, for whatever + * association effect that may have. + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param device A context, whose primary context will be current when allocating + * the memory region, for whatever association effect that may have. 
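+ *
+ * For example (a sketch; the device index and region size are arbitrary):
+ * @code
+ * auto device = cuda::device::get(0);
+ * auto region = cuda::memory::managed::make_unique_region(
+ *     device, 4096, cuda::memory::managed::initial_visibility_t::to_all_devices);
+ * @endcode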
+ */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( size_t num_bytes); diff --git a/src/cuda/define_specifiers.hpp b/src/cuda/define_specifiers.hpp index a907ea11..4234041b 100644 --- a/src/cuda/define_specifiers.hpp +++ b/src/cuda/define_specifiers.hpp @@ -8,6 +8,8 @@ #ifdef __CUDACC__ +/// Shorthands for CUDA-specific function declaration decorations +///@{ #ifndef CUDA_FD #define CUDA_FD __forceinline__ __device__ #endif @@ -57,5 +59,6 @@ #ifndef CUDA_H #define CUDA_H #endif +///@} #endif // __CUDACC__ diff --git a/src/cuda/nvtx/profiling.hpp b/src/cuda/nvtx/profiling.hpp index 546835a3..af8a7518 100644 --- a/src/cuda/nvtx/profiling.hpp +++ b/src/cuda/nvtx/profiling.hpp @@ -40,7 +40,6 @@ #endif #endif - #include #include #include @@ -51,10 +50,16 @@ namespace cuda { // Note: No implementation for now for nvtxStringHandle_t's - +/** + * @namespace profiling + * + * Interaction with NVIDIA's profiler, particularly tagging, marking and + * indications of entities it will pick up and register/display. + */ namespace profiling { namespace detail_ { + inline void set_message(nvtxEventAttributes_t &attrs, const char *c_str) noexcept { attrs.messageType = NVTX_MESSAGE_TYPE_ASCII; @@ -75,16 +80,23 @@ inline void set_message(nvtxEventAttributes_t &attrs, nvtxStringHandle_t rsh) no } // namespace detail_ +/** + * @namespace range + * + * definitions related to profiled ranges and the @ref range_t class + */ namespace range { -enum class type_t { unspecified, kernel, pci_express_transfer }; - /** - * The range handle is actually `nvtxRangeId_t`; but - other than this typedef, - * we don't need to include the nVIDIA Toolkit Extensions headers at all here, - * and can leave them within the implementation only. + * Types of profiled ranges we recognize + * + * @note The profiler itself does not distinguish between these types of ranges; + * we use them for different styling */ -using handle_t = ::std::uint64_t; +enum class type_t { unspecified, kernel, pci_express_transfer }; + +/// The raw handle of a CUDA profiling range +using handle_t = nvtxRangeId_t; } // namespace range @@ -94,10 +106,18 @@ using handle_t = ::std::uint64_t; * profiling information. 
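+ *
+ * For example:
+ * @code
+ * auto orange = cuda::profiling::color_t::from_hex(0x00FF8000); // an orange-like hue
+ * @endcode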
*/ struct color_t { + /// A profiler color corresponds to a 32-bit value using underlying_type = ::std::uint32_t; + + /// Each color channel is an 8-bit value using channel_value = ::std::uint8_t; + + /// A profiler color is made up of three color channels and a transparency + /// or "alpha" channel channel_value alpha, red, green, blue; + /// Construct a profiler color value from a numeric value (typically, + /// an 8-hex-digit literal) static constexpr color_t from_hex(underlying_type raw_argb) noexcept { return { static_cast ((raw_argb >> 24) & 0xFF), @@ -106,7 +126,8 @@ struct color_t { static_cast ((raw_argb >> 0) & 0xFF), }; } - operator underlying_type() const noexcept { return as_hex(); } + + /// @return the numeric value corresponding to this profiler color underlying_type as_hex() const noexcept { return @@ -115,6 +136,12 @@ struct color_t { static_cast(green) << 8 | static_cast(blue) << 0; } + + /// @copydoc as_hex() + operator underlying_type() const noexcept { return as_hex(); } + + /// Some basic colors, for convenience + ///@{ static constexpr color_t Black() noexcept { return from_hex(0x00000000); } static constexpr color_t White() noexcept { return from_hex(0x00FFFFFF); } static constexpr color_t FullRed() noexcept { return from_hex(0x00FF0000); } @@ -129,8 +156,14 @@ struct color_t { static constexpr color_t DarkGreen() noexcept { return from_hex(0x00008800); } static constexpr color_t DarkBlue() noexcept { return from_hex(0x00000088); } static constexpr color_t DarkYellow() noexcept { return from_hex(0x00888800); } + ///@} }; +/** + * @namespace mark + * + * defintions related to marking individual time points in the profiler timeline + */ namespace mark { namespace detail_ { @@ -156,6 +189,8 @@ nvtxEventAttributes_t create_attributes(const CharT* description, color_t color) } // namespace detail_ +/// Mark a single point on the profiler timeline, giving +/// it also a color and some descriptive text template void point(const CharT* description, color_t color = color_t::Black()) { @@ -165,6 +200,15 @@ void point(const CharT* description, color_t color = color_t::Black()) nvtxMarkEx(&attrs); } +/** + * Mark the beginning of a range on the profiler timeline, giving + * it also a color and some descriptive text + * + * @param type the range type - an unused parameter + * + * @return a handle representing the range, which can be used to mark its + * endpoint + */ template range::handle_t range_start( const CharT* description, @@ -180,6 +224,8 @@ range::handle_t range_start( return range_handle; } +/// Mark the end of a range, using the handle obtained when previously +/// marking its beginning. inline void range_end(range::handle_t range_handle) { static_assert(::std::is_same::value, @@ -189,18 +235,14 @@ inline void range_end(range::handle_t range_handle) } // namespace mark -/** - * Start CUDA profiling for the current process - */ +/// Start CUDA profiling for the current process inline void start() { auto status = cuProfilerStart(); throw_if_error_lazy(status, "Starting CUDA profiling"); } -/** - * Stop CUDA profiling for the current process - */ +/// Stop CUDA profiling for the current process inline void stop() { auto status = cuProfilerStop(); @@ -215,7 +257,9 @@ namespace cuda { namespace profiling { /** - * A RAII class whose scope of existence is reflected as a range in the profiler. + * A RAII/CADRe class whose scope of existence is reflected as a range in the + * profiler. + * * Use it in the scope in which you perform some interesting operation, e.g. 
* perform a synchronous I/O operation (and have it conclude of course), or * launch and synch several related kernels. @@ -333,23 +377,29 @@ inline void name(::std::thread::id host_thread_id, const char* name) } // namespace detail_ /** - * @brief Have the profiler refer to the current thread, or another host - * thread, using a specified string identifier (rather than its numeric ID). + * @brief Have the profiler refer to a given host thread, using a specified string + * identifier (rather than its numeric ID). * * @param[in] host_thread A C++-recognized thread to name in profiling results * @param[in] name The name to use for the specified thread */ -///@{ template void name(const ::std::thread& host_thread, const CharT* name); +/** + * @brief Have the profiler refer to the current thread using a specified string + * identifier (rather than its numeric ID). + * + * @param[in] host_thread A C++-recognized thread to name in profiling results + * @param[in] name The name to use for the specified thread + */ template void name_this_thread(const CharT* name) { detail_::name(::std::this_thread::get_id(), name); } -///@} +/// Have the profile assign a name to a certain stream template void name(const stream_t& stream, const CharT* name) { @@ -357,6 +407,7 @@ void name(const stream_t& stream, const CharT* name) detail_::name_stream(stream.handle(), name); } +/// Have the profile assign a name to a certain event template void name(const event_t& event, const CharT* name) { @@ -364,6 +415,7 @@ void name(const event_t& event, const CharT* name) detail_::name_stream(event.handle(), name); } +/// Have the profile assign a name to a certain CUDA device template void name(const device_t& device, const CharT* name) { diff --git a/src/cuda/rtc/compilation_options.hpp b/src/cuda/rtc/compilation_options.hpp index f24c0adc..c3182c6c 100644 --- a/src/cuda/rtc/compilation_options.hpp +++ b/src/cuda/rtc/compilation_options.hpp @@ -62,7 +62,11 @@ inline cpp_dialect_t cpp_dialect_from_name(const char* dialect_name) noexcept(fa namespace error { +/// Possible ways of handling a potentially problematic finding by the compiler +/// in the program source code enum handling_method_t { raise_error = 0, suppress = 1, warn = 2 }; + +/// Errors, or problematic findings, by the compiler are identified by a number of this type using number_t = unsigned; namespace detail_ { @@ -135,8 +139,7 @@ struct compilation_options_base_t { return set_target(device.compute_capability()); } ///@} - -}; +}; // compilation_options_base_t /// Commonly-used phrases regarding the optimization level (e.g. from GCC's /// command-line arguments), translated into the numeric levels the RTC @@ -150,7 +153,12 @@ enum : rtc::optimization_level_t { maximum_optimization = O3 }; - +/** + * Options to be passed to one of the NVIDIA JIT compilers along with a program's source code + * + * @note at the raw API level, the options are passed in a simpler form, less convenient for + * modification. This is handled by the @ref program_t class. + */ template class compilation_options_t; @@ -160,57 +168,111 @@ class compilation_options_t final : public common_ptx_compilation_options_t { public: + ///@cond using parent = compilation_options_base_t; using parent::parent; + ///@nocond - /** - * Makes the PTX compiler run without producing any CUBIN output - for verifying - * the input PTX only. 
- */ + /// Makes the PTX compiler run without producing any CUBIN output (for PTX verification only) bool parse_without_code_generation { false }; + + /// Allow the JIT compiler to perform expensive optimizations using maximum available resources + /// (memory and compile-time). bool allow_expensive_optimizations_below_O2 { false }; + + /** + * Compile as patch code for CUDA tools. + * + * @note : + * + * 1. Cannot Shall not be used in conjunction with @ref parse_without_code_generation + * or {@ref compile_extensible_whole_program}. + * 2. Some PTX ISA features may not be usable in this compilation mode. + */ bool compile_as_tools_patch { false }; + + /** + * Expecting only whole-programs to be directly usable, allow some calls to not be resolved + * until device-side linking is performed (see @ref link_t). + */ bool compile_extensible_whole_program { false }; + + /// Enable the contraction of multiplcations-followed-by-additions (or subtractions) into single + /// fused instructions (FMAD, FFMA, DFMA) bool use_fused_multiply_add { true }; + + /// Print code generation statistics along with the compilation log bool verbose { false }; + + /** + * Prevent the compiler from merging consecutive basic blocks + * (@ref https://en.wikipedia.org/wiki/Basic_block) into a single block. + * + * Normally, the compiler attempts to merge consecutive "basic blocks" as part of its optimization + * process. However, for debuggable code this is very confusing. + */ bool dont_merge_basicblocks { false }; + + /// The equivalent of suppressing all findings which currently trigger a warning bool disable_warnings { false }; + + /// Disable use of the "optimizer constant bank" feature bool disable_optimizer_constants { false }; + + /// Prevents the optimizing away of the return instruction at the end of a program (a kernel?), + /// making it possible to set a breakpoint just at that point bool return_at_end_of_kernel { false }; + + /// Generate relocatable references for variables and preserve relocations generated for them in + /// the linked executable. bool preserve_variable_relocations { false }; + + /// Warnings about situations likely to result in poor performance + /// or other problems. struct { bool double_precision_ops { false }; bool local_memory_use { false }; bool registers_spill_to_local_memory { false }; - bool indeterminable_stack_size {true }; + bool indeterminable_stack_size { true }; // Does the PTX compiler library actually support this? ptxas does, but the PTX compilation API // doesn't mention it bool double_demotion { false }; } situation_warnings; + + /// Limits on the number of registers which generated object code (of different kinds) is allowed + /// to use struct { optional kernel {}; optional device_function {}; } maximum_register_counts; + /// Options for fully-specifying a caching mode struct caching_mode_spec_t { optional> load {}; optional> store {}; }; struct { + /// The caching mode to be used for instructions which don't specify a caching mode caching_mode_spec_t default_ {}; + /// A potential forcing of the caching mode, overriding even what instructions themselves + /// specify caching_mode_spec_t forced {}; } caching_modes; + /// Get a reference to the caching mode the compiler will be told to use as the default, for load + /// instructions which don't explicitly specify a particular caching mode. 
optional>& default_load_caching_mode() override { return caching_modes.default_.load; } + + /// Get the caching mode the compiler will be told to use as the default, for load instructions + /// which don't explicitly specify a particular caching mode. optional> default_load_caching_mode() const override { return caching_modes.default_.load; } - /** * Specifies the GPU kernels, or `__global__` functions in CUDA-C++ terms, or `.entry` * functions in PTX terms, for which code must be generated. @@ -222,7 +284,7 @@ class compilation_options_t final : ::std::vector<::std::string>& entries(); ::std::vector<::std::string>& kernels(); ::std::vector<::std::string>& kernel_names(); -}; +}; // compilation_options_t template <> class compilation_options_t final : @@ -303,10 +365,8 @@ class compilation_options_t final : */ bool use_fused_multiply_add { true }; - /** - * Make use of fast math operations. Implies use_fused_multiply_add, - * not use_precise_division and not use_precise_square_root. - */ + /// Make use of fast math operations. Implies use_fused_multiply_add, + /// not use_precise_division and not use_precise_square_root. bool use_fast_math { false }; /** @@ -316,48 +376,39 @@ class compilation_options_t final : */ bool link_time_optimization { false }; - /** - * Implicitly add the directories of source files (TODO: Which source files?) as - * include file search paths. - */ + /// Implicitly add the directories of source files (TODO: Which source files?) as include + /// file search paths. bool source_dirs_in_include_path { true }; - /** - * Enables more aggressive device code vectorization in the LTO IR optimizer. - */ + ///Enables more aggressive device code vectorization in the LTO IR optimizer. bool extra_device_vectorization { false }; - /** - * Set language dialect to C++03, C++11, C++14 or C++17. - * - */ + /// The dialect of C++ as which the compiler will be forced to interpret the program source code optional language_dialect { }; + /// Preprocessor macros to have the compiler define, without specifying a particular value ::std::unordered_set<::std::string> no_value_defines; + + /// Preprocessor macros to tell the compiler to specifically _un_define. ::std::unordered_set<::std::string> undefines; + + /// Preprocessor macros to have the compiler define to specific values ::std::unordered_map<::std::string,::std::string> valued_defines; + /// Have the compiler treat all warnings as though they were suppressed, and print nothing bool disable_warnings { false }; - /** - * Treat all kernel pointer parameters as if they had the `restrict` (or `__restrict`) qualifier. - */ + /// Treat all kernel pointer parameters as if they had the `restrict` (or `__restrict`) qualifier. bool assume_restrict { false }; - /** - * Assume functions without an explicit specification of their execution space are `__device__` - * rather than `__host__` functions. - */ + /// Assume functions without an explicit specification of their execution space are `__device__` + /// rather than `__host__` functions. bool default_execution_space_is_device { false }; - /** - * Display (error) numbers for warning (and error?) messages, in addition to the message itself. - */ + /// Display (error) numbers for warning (and error?) messages, in addition to the message itself. bool display_error_numbers { true }; - /** - * Extra options for the PTX compiler (a.k.a. "PTX optimizing assembler"). - */ + /// Extra options for the PTX compiler (a.k.a. "PTX optimizing assembler"). 
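+ /// For example (a sketch; any valid ptxas command-line switches may be used):
+ /// `options.ptxas = "--verbose";`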
::std::string ptxas; /** @@ -421,18 +472,23 @@ class compilation_options_t final : public: // "shorthands" for more complex option setting - compilation_options_t& set_language_dialect(cpp_dialect_t dialect) + /// Let the compiler interpret the program source code using its default-assumption for the + /// C++ language dialect + compilation_options_t& clear_language_dialect() { - language_dialect = dialect; + language_dialect = {}; return *this; } - compilation_options_t& clear_language_dialect() + /// Set which dialect of the C++ language the compiler will try to interpret + /// the program source code as. + compilation_options_t& set_language_dialect(cpp_dialect_t dialect) { - language_dialect = {}; + language_dialect = dialect; return *this; } + /// @copydoc set_language_dialect(cpp_dialect_t) compilation_options_t& set_language_dialect(const char* dialect_name) { return (dialect_name == nullptr or *dialect_name == '\0') ? @@ -440,6 +496,7 @@ class compilation_options_t final : set_language_dialect(detail_::cpp_dialect_from_name(dialect_name)); } + /// @copydoc set_language_dialect(cpp_dialect_t) compilation_options_t& set_language_dialect(const ::std::string& dialect_name) { return dialect_name.empty() ? @@ -447,24 +504,30 @@ class compilation_options_t final : set_language_dialect(dialect_name.c_str()); } + /// Ignore compiler findings of the specified number (rather than warnings about + /// them or raising an error) compilation_options_t& suppress_error(error::number_t error_number) { error_handling_overrides[error_number] = error::suppress; return *this; } + /// Treat compiler findings of the specified number as an error (rather than + /// suppressing them or just warning about them) compilation_options_t& treat_as_error(error::number_t error_number) { error_handling_overrides[error_number] = error::raise_error; return *this; } + /// Treat compiler findings of the specified number as warnings (rather than + /// raising an error or ignoring them) compilation_options_t& warn_about(error::number_t error_number) { error_handling_overrides[error_number] = error::warn; return *this; } -}; +}; // compilation_options_t namespace detail_ { @@ -488,7 +551,6 @@ MarshalTarget& operator<<(MarshalTarget& mt, detail_::opt_start_t& op return mt; } - /** * Uses the streaming/left-shift operator (<<) to render a delimited sequence of * command-line-argument-like options (with or without a value as relevant) @@ -508,8 +570,7 @@ void process( if (opts.generate_relocatable_device_code) { marshalled << opt_start << "--compile-only"; } if (opts.compile_as_tools_patch) { marshalled << opt_start << "--compile-as-tools-patch"; } if (opts.generate_debug_info) { marshalled << opt_start << "--device-debug"; } - if (opts.generate_source_line_info) - { marshalled << opt_start << "--generate-line-info"; } + if (opts.generate_source_line_info) { marshalled << opt_start << "--generate-line-info"; } if (opts.compile_extensible_whole_program) { marshalled << opt_start << "--extensible-whole-program"; } if (not opts.use_fused_multiply_add) { marshalled << opt_start << "--fmad false"; } if (opts.verbose) { marshalled << opt_start << "--verbose"; } @@ -671,6 +732,15 @@ void process( } } +/** + * Finalize a compilation options "building" object into a structure passable to some of the + * CUDA JIT compilation APIs + * + * @tparam Kind The kind of JITable program options to render + * + * @return A structure of multiple strings, passable to various CUDA APIs, but no longer + * easy to modify and manipulate. 
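+ *
+ * @note this is an implementation detail: user code would typically just populate a
+ * @ref compilation_options_t and let the program/compilation wrapper classes perform
+ * the marshalling.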
+ */ template inline marshalled_options_t marshal(const compilation_options_t& opts) { @@ -683,6 +753,14 @@ inline marshalled_options_t marshal(const compilation_options_t& opts) } // namespace detail_ +/** + * Finalize a set of compilation options into the form of a string appendable to a command-line + * + * @tparam Kind The kind of JITable program options to render + * + * @return a string made up of command-line options - switches and options with arguments, + * designated by single or double dashes. + */ template inline ::std::string render(const compilation_options_t& opts) { diff --git a/src/cuda/rtc/compilation_output.hpp b/src/cuda/rtc/compilation_output.hpp index 45e457c5..c95359bf 100644 --- a/src/cuda/rtc/compilation_output.hpp +++ b/src/cuda/rtc/compilation_output.hpp @@ -23,8 +23,18 @@ class context_t; ///@endcond namespace rtc { + +/** + * The output produced by a compilation process by one of the CUDA libraries, + * including any byproducts. + * + * @tparam Kind Which language was compiled to produce the result + * + * @note A failed compilation is also a (useful) kind of compilation output. + */ template class compilation_output_t; + } // namespace rtc ///@cond @@ -41,6 +51,7 @@ class module_t; namespace module { +/// Build a contextualized module from the results of a successful compilation template inline module_t create( const context_t& context, @@ -49,9 +60,6 @@ inline module_t create( } // namespace module -/** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. - */ namespace rtc { namespace program { @@ -237,6 +245,11 @@ template <> inline status_t destroy_and_return_status(handle } // namespace program +/** + * @namespace compilation_output + * + * Definitions relating to and supporting the @ref compilation_output_t class + */ namespace compilation_output { namespace detail_ { @@ -256,12 +269,14 @@ inline compilation_output_t wrap( } // namespace compilation_output /** - * Wrapper class for the result of an NVRTC compilation (including the program handle) - - * whether it succeeded or failed due to errors in the program itself. + * The result of the compilation of an {@ref rtc::program_t}, whether successful or + * failed, with any related byproducts. + * + * @note This class _may_ own a low-level program handle. * - * @note This class _may_ own an NVRTC low-level program handle. * @note If compilation failed due to apriori-invalid arguments - an exception will - * have been thrown. The only failure this class may represent + * have been thrown. A failure indication in this class indicates a program whose + * compilation actually _took place_ and ended with a failure. */ template class compilation_output_base_t { @@ -271,27 +286,30 @@ class compilation_output_base_t { using status_type = status_t; public: // getters + + /// @returns `true` if the compilation resulting in this output had succeeded bool succeeded() const { return succeeded_; } + + /// @returns `true` if the compilation resulting in this output had failed bool failed() const { return not succeeded_; } + + /// @returns `true` if the compilation resulting in this output had succeeded, `false` otherwise operator bool() const { return succeeded_; } const ::std::string& program_name() const { return program_name_; } handle_type program_handle() const { return program_handle_; } public: // non-mutators - // Unfortunately, C++'s standard string class is very inflexible, - // and it is not possible for us to get it to have an appropriately- - // sized _uninitialized_ buffer. 
We will therefore have to use - // a clunkier return type. - // - // ::std::string log() const - /** - * Obtain a copy of the log of the last compilation + * Write a copy of the program compilation log into a user-provided buffer * - * @note This will fail if the program has never been compiled. + * @param[inout] buffer A writable buffer large enough to contain the compilation log + * + * @return the buffer passed in (which has now been overwritten with the log) + * + * @note This will fail if the program has never been compiled, or if the + * buffer is not large enough to hold the complete log (plus nul character). */ - ///@{ span log(span buffer) const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -305,6 +323,13 @@ class compilation_output_base_t { return { buffer.data(), size }; } + /** + * Obtain a copy of the compilation log + * + * @returns an owning container with a nul-terminated copy of the log + * + * @note This will fail if the program has never been compiled. + */ unique_span log() const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -317,11 +342,34 @@ class compilation_output_base_t { result[size] = '\0'; return result; } - ///@} #if CUDA_VERSION >= 11010 - virtual unique_span cubin() const = 0; + /** + * Write the CUBIN result of the last compilation into a buffer. + * + * @param[inout] buffer A writable buffer large enough to contain the compiled + * program's CUBIN code. + * @return The sub-buffer, starting at the beginning of @p buffer, containing + * exactly the compiled program's CUBIN (i.e. sized down to fit the contents) + * + * @note This will fail if the program has never been compiled; due to + * compilation failure and also due to LTO/linking failure. + */ virtual span cubin(span buffer) const = 0; + + /** + * Obtain a copy of the CUBIN code resulting from the program compilation + * + * @returns an owning container with a copy of the CUBIN code + * + * @note This will fail if the program has never been compiled; if the compilation + * target was a virtual architecture (in which case only PTX is available); due to + * compilation failure and also due to LTO/linking failure. + */ + virtual unique_span cubin() const = 0; + + /// @returns true if the program has been successfully compiled, with the result + /// containing CUBIN code. virtual bool has_cubin() const = 0; #endif @@ -382,9 +430,17 @@ class compilation_output_t : public compilation_output_base_t ptx(span buffer) const { @@ -398,6 +454,13 @@ class compilation_output_t : public compilation_output_base_t ptx() const { size_t size = program::detail_::get_ptx_size(program_handle_, program_name_.c_str()); @@ -410,8 +473,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t= 11010 - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note CUBIN output is not available when compiling for a virtual architecture only. - * Also, it may be missing in cases such as compilation failure or link-time - * optimization compilation. - * @note This will fail if the program has never been compiled. 
- */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -456,7 +511,6 @@ class compilation_output_t : public compilation_output_base_t(result.data(), program_handle_, program_name_.c_str()); return result; } - ///@} bool has_cubin() const override { @@ -467,13 +521,12 @@ class compilation_output_t : public compilation_output_base_t 0); } - #endif #if CUDA_VERSION >= 11040 /** - * Obtain a copy of the LTO IR result of the last compilation - the intermediate - * representation used for link-time optimization + * Write the LTO IR result of the last compilation - the intermediate + * representation used for link-time optimization - into a buffer * * @throws ::std::invalid_argument if the supplied buffer is too small to hold * the program's LTO IR. @@ -485,7 +538,6 @@ class compilation_output_t : public compilation_output_base_t lto_ir(span buffer) const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -498,6 +550,15 @@ class compilation_output_t : public compilation_output_base_t lto_ir() const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -510,12 +571,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t { bool own_handle); public: // non-mutators - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note This will fail if the program has never been compiled. - */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -593,7 +645,6 @@ class compilation_output_t : public compilation_output_base_t { return { buffer.data(), size }; } -public: // non-mutators unique_span cubin() const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -606,7 +657,6 @@ class compilation_output_t : public compilation_output_base_t { result[size] = '\0'; return result; } - ///@} bool has_cubin() const override { @@ -695,6 +745,8 @@ template<> inline module_t create( #endif // CUDA_VERSION >= 11010 +/// Build a module from the results of a successful compilation, in the primary context +/// of the specified device template inline module_t create( device_t& device, diff --git a/src/cuda/rtc/error.hpp b/src/cuda/rtc/error.hpp index d880a2b5..8ac427b9 100644 --- a/src/cuda/rtc/error.hpp +++ b/src/cuda/rtc/error.hpp @@ -121,7 +121,6 @@ namespace rtc { template class runtime_error : public ::std::runtime_error { public: - ///@cond // TODO: Constructor chaining; and perhaps allow for more construction mechanisms? 
runtime_error(status_t error_code) : ::std::runtime_error(describe(error_code)), @@ -132,7 +131,6 @@ class runtime_error : public ::std::runtime_error { ::std::runtime_error(::std::move(what_arg) + ": " + describe(error_code)), code_(error_code) { } - ///@endcond runtime_error(status::named_t error_code) : runtime_error(static_cast>(error_code)) { } runtime_error(status::named_t error_code, const ::std::string& what_arg) : @@ -190,6 +188,15 @@ inline void throw_if_error(rtc::status_t status) noexcept(false) if (is_failure(status)) { throw rtc::runtime_error(status); } } +/** + * Throws a @ref ::cuda::rtc::runtime_error exception if the status is not success + * + * @note The rationale for this macro is that neither the exception, nor its constructor + * arguments, are evaluated on the "happy path"; and that cannot be achieved with a + * function - which genertally/typically evaluates its arguments. To guarantee this + * lazy evaluation with a function, we would need exception-construction-argument-producing + * lambdas, which we would obviously rather avoid. + */ #define throw_if_rtc_error_lazy(Kind, status__, ... ) \ do { \ ::cuda::rtc::status_t tie_status__ = static_cast<::cuda::rtc::status_t>(status__); \ diff --git a/src/cuda/rtc/program.hpp b/src/cuda/rtc/program.hpp index 323eefd5..820b85aa 100644 --- a/src/cuda/rtc/program.hpp +++ b/src/cuda/rtc/program.hpp @@ -23,6 +23,19 @@ namespace program { namespace detail_ { +/** + * Create a new program object from source code + * + * @tparam Kind We can create a program with any one of the (two...) kinds of supported + * source code + * @param program_name arbitrary identifier to recognize the program by; it's suggested + * not to get too crazy + * @param program_source The source code of the program, possibly with include directives + * in the case of C++ + * @param num_headers The number of pairs of header "file" names and header content strings + * @param header_sources Pointers to nul-terminated per-header source code + * @param header_names Pointers to nul-terminated names of the different headers + */ template inline program::handle_t create( const char *program_name, @@ -53,6 +66,7 @@ template <> inline program::handle_t create( } #endif // CUDA_VERSION >= 11010 +/// Have NVRTC add the specified global to those accessible/usable after compilation inline void register_global(handle_t program_handle, const char *global_to_register) { auto status = nvrtcAddNameExpression(program_handle, global_to_register); @@ -60,6 +74,8 @@ inline void register_global(handle_t program_handle, const char *globa + " with " + identify(program_handle)); } +/// Splice multiple raw string options together with a ' ' separator character, and +/// surrounding each option with double-quotes inline ::std::string get_concatenated_options(const const_cstrings_span& raw_options) { static ::std::ostringstream oss; @@ -186,14 +202,22 @@ class base_t { using status_type = status_t; public: // getters - + /// Getters for the constituent object fields + ///@{ const ::std::string& name() const { return name_; } + + /// Full source code of the program (possibly with preprocessor directives such as `#include`) const char* source() const { return source_; } + + /// Compilation options to be passed to the JIT compiling library along with the source code const compilation_options_t& options() const { return options_; } // TODO: Think of a way to set compilation options without having // to break the statement, e.g. 
if options had a reflected enum value // or some such arrangement. + + /// Compilation options to be passed to the JIT compiling library along with the source code compilation_options_t& options() { return options_; } + ///@} public: // constructors and destructor explicit base_t(::std::string name) : name_(::std::move(name)) {}; @@ -225,6 +249,9 @@ class program_t; * @note This class is a "reference type", not a "value type". Therefore, making changes * to the program is a const-respecting operation on this class. * + * @note Many of this class' methods could have been placed in the base class, and are + * "duplicated" in program_t - except that they return the program object itself, + * allowing for builder-pattern-like use. */ template <> class program_t : public program::detail_::base_t { @@ -233,26 +260,50 @@ class program_t : public program::detail_::base_t { public: // getters + /// Names of the "memoized"/off-file-system headers made available to the program + /// (and usable as identifiers for `#include` directives) const_cstrings_span header_names() const { return { headers_.names.data(),headers_.names.size()}; } + + /// Sources of the "memoized"/off-file-system headers made available to the program + /// + /// @note Each header source string corresponds to the name at the same index, + /// accessible via @ref header_names(). const_cstrings_span header_sources() const { return { headers_.sources.data(), headers_.sources.size()}; } + + /// @returns the number of memoized/off-file-system headers made available + /// to the program size_t num_headers() const { return headers_.sources.size(); } public: // setters - duplicated with PTX programs + /// Have the compilation produce code for devices with a given compute capability program_t& set_target(device::compute_capability_t target_compute_capability) { options_.set_target(target_compute_capability); return *this; } + + /// Have the compilation produce code for devices with the same compute capability + /// as a given device program_t& set_target(const device_t& device) { return set_target(device.compute_capability());} + + /// Have the compilation produce code for devices with the same compute capability + /// as the device of a given context program_t& set_target(const context_t& context) { return set_target(context.device()); } + + /// Remove all compute capabilities which were chosen to have code produced for them + /// by the compilation program_t& clear_targets() { options_.targets_.clear(); return *this; } + + /// Choose the set of compute capabilities for which the compilation is to + /// produce code template program_t& set_targets(Container target_compute_capabilities) { @@ -262,12 +313,21 @@ class program_t : public program::detail_::base_t { } return *this; } + + /// Have the compilation also produce code for devices with a given compute + /// capability program_t& add_target(device::compute_capability_t target_compute_capability) { options_.add_target(target_compute_capability); return *this; } + + /// Have the compilation also produce code for devices with the same compute + /// capability as a given device void add_target(const device_t& device) { add_target(device.compute_capability()); } + + /// Have the compilation also produce code for devices with the same compute + /// capability as the device of a given context void add_target(const context_t& context) { add_target(context.device()); } program_t& set_source(const char* source) { 
source_ = source; return *this; } @@ -285,7 +345,7 @@ class program_t : public program::detail_::base_t { protected: template - static inline void check_string_type() + static void check_string_type() { using no_cref_string_type = typename ::std::remove_const::type>::type; static_assert( @@ -315,6 +375,15 @@ class program_t : public program::detail_::base_t { void add_header_source_(::std::string&& source) = delete; public: // mutators + /** + * Adds another "memoized" header to the program + * + * @param name The header name for use in `#include` directives + * @param source The full source code of the header "file", possibly with its own + * preprocessor directives (e.g. `#include`). + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& add_header(String1&& name, String2&& source) { @@ -323,6 +392,15 @@ class program_t : public program::detail_::base_t { return *this; } + /** + * Adds another "memoized" header to the program + * + * @param name_and_source A pair of strings, one being the name for use in `#include` + * directives, the other being the full source code of the header "file", possibly + * with its own preprocessor directives (e.g. `#include`). + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& add_header(const ::std::pair& name_and_source) { @@ -331,6 +409,7 @@ class program_t : public program::detail_::base_t { return *this; } + /// @copydoc add_header(String1&&, String2&&) template program_t& add_header(::std::pair&& name_and_source) { @@ -339,6 +418,15 @@ class program_t : public program::detail_::base_t { return add_header(name_and_source); } + /** + * Adds multiple "memoized" headers to the program + * + * @param header_names Names of the headers, for use in `#include` directives + * @param header_sources The full source code of each of the header "files", possibly + * with their own preprocessor directives. + * + * @note "names" with path separators can be used, but are discouraged + */ template const program_t& add_headers( RangeOfNames header_names, @@ -357,7 +445,7 @@ class program_t : public program::detail_::base_t { #ifndef NDEBUG if (new_num_headers > ::std::numeric_limits::max()) { throw ::std::invalid_argument("Cannot use more than " - + ::std::to_string(::std::numeric_limits::max()) + " headers."); + + ::std::to_string(::std::numeric_limits::max()) + " headers."); } #endif headers_.names.reserve(new_num_headers); @@ -371,6 +459,15 @@ class program_t : public program::detail_::base_t { return *this; } + /** + * Adds multiple "memoized" headers to the program + * + * @param name_and_source_pairs A container of pairs of strings, each being made + * up of a name for use in `#include` directives, and the full source code + * of the header "file", possibly with its own preprocessor directives. + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& add_headers(RangeOfNameAndSourcePairs&& named_header_pairs) { @@ -392,6 +489,15 @@ class program_t : public program::detail_::base_t { return *this; } + /** + * Replaces the set of "memoized" headers used in the program's compilation + * + * @param names Names of the headers, for use in `#include` directives + * @param sources The full source code of each of the header "files", possibly + * with their own preprocessor directives. 
+ * + * @note "names" with path separators can be used, but are discouraged + */ template const program_t& set_headers( RangeOfNames&& names, @@ -401,6 +507,15 @@ class program_t : public program::detail_::base_t { return add_headers(names, sources); } + /** + * Replaces the set of "memoized" headers used in the program's compilation + * + * @param name_and_source_pairs A container of pairs of strings, each being made + * up of a name for use in `#include` directives, and the full source code + * of the header "file", possibly with its own preprocessor directives. + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& set_headers(RangeOfNameAndSourcePairs&& named_header_pairs) { @@ -409,6 +524,7 @@ class program_t : public program::detail_::base_t { return *this; } + /// Removes all "memoized" headers from the program's compilation program_t& clear_headers() { headers_.names.clear(); @@ -416,12 +532,21 @@ class program_t : public program::detail_::base_t { return *this; } + /// Clears any forced values of compilation options, reverting the compilation + /// to the default values program_t& clear_options() { options_ = {}; return *this; } public: // TODO: Support specifying all compilation option in a single string and parsing it + /** + * Compiles the program represented by this object (which, until this point, is + * just a bunch of unrelated sources and options). + * + * @note Carefully examine the @ref compilation_output_t class to understand what + * exactly the compilation produces. + */ compilation_output_t compile() const { if ((source_ == nullptr or *source_ == '\0') and options_.preinclude_files.empty()) { @@ -447,19 +572,19 @@ class program_t : public program::detail_::base_t { * @note The name must continue to exist past the compilation of the program - as it is not copied, * only referenced */ - ///@{ program_t& add_registered_global(const char* unmangled_name) { globals_to_register_.push_back(unmangled_name); return *this; } + + /// @copydoc add_registered_global(const char*) program_t& add_registered_global(const ::std::string& unmangled_name) { globals_to_register_.push_back(unmangled_name.c_str()); return *this; } // TODO: Accept string_view's with C++17 - ///@} /** * @brief Register multiple pre-mangled names of global, to make available for use @@ -472,7 +597,6 @@ class program_t : public program::detail_::base_t { * program - as they are not copied, only referenced. 
Thus, as a safety precaution, we * also assume the container continues to exist */ - ///@{ template program_t& add_registered_globals(const Container& globals_to_register) { @@ -483,6 +607,7 @@ class program_t : public program::detail_::base_t { return *this; } + /// @copydoc add_registered_globals(const Container&) template program_t& add_registered_globals(Container&& globals_to_register) { @@ -491,7 +616,6 @@ class program_t : public program::detail_::base_t { "the possible passing of string-like objects at the end of their lifetime"); return add_registered_globals(static_cast(globals_to_register)); } - ///@} public: // constructors and destructor program_t(::std::string name) : base_t(::std::move(name)) {} @@ -500,9 +624,10 @@ class program_t : public program::detail_::base_t { ~program_t() = default; public: // operators - + ///@cond program_t& operator=(const program_t& other) = default; program_t& operator=(program_t&& other) = default; + ///@endcond protected: // data members struct { @@ -514,6 +639,16 @@ class program_t : public program::detail_::base_t { #if CUDA_VERSION >= 11010 +/** + * Wrapper class for a CUDA PTX (runtime-compilable) program + * + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the program is a const-respecting operation on this class. + * + * @note Many of this class' methods could have been placed in the base class, and are + * "duplicated" in program_t - except that they return the program object itself, + * allowing for builder-pattern-like use. + */ template <> class program_t : public program::detail_::base_t { public: // types @@ -523,14 +658,23 @@ class program_t : public program::detail_::base_t { public: // setters - duplicated with CUDA-C++/NVRTC programs + /// @copydoc program_t::set_target(device::compute_capability_t) program_t& set_target(device::compute_capability_t target_compute_capability) { options_.set_target(target_compute_capability); return *this; } + + /// @copydoc program_t::set_target(const device_t&) program_t& set_target(const device_t& device) { return set_target(device.compute_capability());} + + /// @copydoc program_t::set_target(const context_t&) program_t& set_target(const context_t& context) { return set_target(context.device()); } + + /// @copydoc program_t::clear_targets() program_t& clear_targets() { options_.targets_.clear(); return *this; } + + /// @copydoc program_t::set_targets(Container) template program_t& set_targets(Container target_compute_capabilities) { @@ -540,27 +684,40 @@ class program_t : public program::detail_::base_t { } return *this; } + + /// @copydoc program_t::add_target(device::compute_capability_t) program_t& add_target(device::compute_capability_t target_compute_capability) { options_.add_target(target_compute_capability); return *this; } + + /// @copydoc program_t::add_target(const device_t&) void add_target(const device_t& device) { add_target(device.compute_capability()); } + + /// @copydoc program_t::add_target(const context_t&) void add_target(const context_t& context) { add_target(context.device()); } - program_t& set_source(const char* source) { source_ = source; return *this; } + /// @copydoc program_t::set_source(const char*) + program_t& set_source(char const* source) { source_ = source; return *this; } + + /// @copydoc program_t::set_source(const ::std::string&) program_t& set_source(const ::std::string& source) { source_ = source.c_str(); return *this; } + + /// @copydoc program_t::set_options(compilation_options_t) program_t& 
set_options(compilation_options_t options) { options_ = ::std::move(options); return *this; } + /// @copydoc program_t::clear_options() program_t& clear_options() { options_ = {}; return *this; } public: // TODO: Support specifying all compilation option in a single string and parsing it + /// @copydoc program_t::compile() compilation_output_t compile() const { if (source_ == nullptr or *source_ == '\0') { @@ -574,7 +731,6 @@ class program_t : public program::detail_::base_t { {option_ptrs.data(), option_ptrs.size()}); } - public: // constructors and destructor program_t(::std::string name) : parent(std::move(name)) {} program_t(const program_t&) = default; @@ -583,20 +739,27 @@ class program_t : public program::detail_::base_t { public: // operators + ///@cond program_t& operator=(const program_t& other) = default; program_t& operator=(program_t&& other) = default; + ///@endcond }; // class program_t #endif // CUDA_VERSION >= 11010 namespace program { +/** + * Create a new (not-yet-compiled) program, specifying only its name; its sources + * and compilation options are to be set subsequently. + */ template inline program_t create(const char* program_name) { return program_t(program_name); } +/// @copydoc create(const char*) template inline program_t create(const ::std::string& program_name) { @@ -605,6 +768,12 @@ inline program_t create(const ::std::string& program_name) } // namespace program +/** + * @returns all compute capabilities supported as targets by NVRTC and (most likely) + * also by the PTX compilation library. + * + * @note The compute capabilities are returned in ascending order. + */ #if CUDA_VERSION >= 11020 inline unique_span supported_targets() diff --git a/src/cuda/rtc/types.hpp b/src/cuda/rtc/types.hpp index 1526a4c6..80f0de90 100644 --- a/src/cuda/rtc/types.hpp +++ b/src/cuda/rtc/types.hpp @@ -51,13 +51,20 @@ enum source_kind_t { // provide a container which may then be resized. /** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. + * @namespace rtc + * + * @brief Real-time compilation of programs using NVIDIA's NVRTC and PTX compilation libraries. + */ namespace rtc { /// A span of C-style strings the contents of which must not be modified using const_cstrings_span = span; +/** + * @namespace program + * + * Definitions relating to source-code programs to be compiled + */ namespace program { namespace detail_ {
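+// (The detail_ sub-namespace contains implementation details, not intended for direct use by library users)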