diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 58a9ea49..4519cfd6 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -270,16 +270,20 @@ array_t wrap( return { device_id, context_handle, handle, dimensions }; } +/// Create a new (typed) CUDA array of the specified dimensions +///@{ +/// @param context ... in which the array is to be created template array_t create( const context_t& context, dimensions_t dimensions); +/// @param device ... in whose primary context the array is to be created template array_t create( const device_t& device, dimensions_t dimensions); - +///@} } // namespace array diff --git a/src/cuda/api/common_ptx_compilation_options.hpp b/src/cuda/api/common_ptx_compilation_options.hpp index 6a92a206..24a9b9e6 100644 --- a/src/cuda/api/common_ptx_compilation_options.hpp +++ b/src/cuda/api/common_ptx_compilation_options.hpp @@ -193,7 +193,7 @@ struct common_ptx_compilation_options_t { bool generate_relocatable_device_code { false }; // What about store caching? -}; +}; // common_ptx_compilation_options_t } // namespace rtc } // namespace cuda diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 1933d66e..ca97c5ea 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -210,6 +210,15 @@ inline context::flags_t get_flags(handle_t handle) } // namespace context +/** + * Waits for all previously-scheduled tasks on all streams (= queues) + * in a CUDA context to conclude, before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified context, the thread calling this method will either yield, + * spin or block until all tasks scheduled previously scheduled on streams + * within this context have concluded. + */ inline void synchronize(const context_t& context); /** @@ -745,10 +754,18 @@ inline handle_t create_and_push( /** * @brief creates a new context on a given device * - * @param device The device on which to create the new stream + * @param device + * The device which the new context will regard * @param sync_scheduling_policy + * Choice of how host threads are to perform synchronization with pending + * actions in streams within this context. See + * @ref host_thread_sync_scheduling_policy_t for a description of these + * choices. * @param keep_larger_local_mem_after_resize - * @return + * If true, larger allocations of global device memory, used by kernels + * requiring a larger amount of local memory, will be kept (so that future + * kernels with such requirements will not trigger a re-allocation). + * * @note Until CUDA 11, there used to also be a flag for enabling/disabling * the ability of mapping pinned host memory to device addresses. 
However, it was * being ignored since CUDA 3.2 already, with the minimum CUDA version supported @@ -861,6 +878,7 @@ inline context_t get_with_fallback_push() } // namespace current +/// @return true if the context is the primary context of its device bool is_primary(const context_t& context); namespace detail_ { diff --git a/src/cuda/api/detail/unique_span.hpp b/src/cuda/api/detail/unique_span.hpp index 5f18f4b2..7db1e752 100644 --- a/src/cuda/api/detail/unique_span.hpp +++ b/src/cuda/api/detail/unique_span.hpp @@ -37,7 +37,6 @@ namespace cuda { * * @tparam T the type of individual elements in the unique_span */ - template> class unique_span : public ::cuda::span { public: // span types diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index 4dfaf4ad..2fd91dd2 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -38,7 +38,7 @@ class pool_t; * @brief Waits for all previously-scheduled tasks on all streams (= queues) * on a specified device to conclude. * - * Depending on the host_thread_sync_scheduling_policy_t set for this + * Depending on the host_thread_sync_scheduling_policy_t set for the specified * device, the thread calling this method will either yield, spin or block * until all tasks scheduled previously scheduled on this device have been * concluded. @@ -604,11 +604,17 @@ class device_t { set_flags(other_flags | static_cast(new_policy)); } + /// @returns true if the device will keep larger amounts of global device memory allocated + /// for use as local memory, after a kernel was executed which required a larger-than-usual + /// allocation bool keeping_larger_local_mem_after_resize() const { return flags() & CU_CTX_LMEM_RESIZE_TO_MAX; } + /// @brief Instructs the (primary context of) the device to keep larger amounts of global + /// device memory allocated for use as local memory, after a kernel was executed which + /// required a larger-than-usual allocation void keep_larger_local_mem_after_resize(bool keep = true) { auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX; @@ -616,6 +622,9 @@ class device_t { set_flags(new_flags); } + /// @brief Instructs the (primary context of) the device to discard allocations of larger + /// amounts of global device memory which were used by a kernel requiring a larger amount + /// of local memory, and has concluded execution. void dont_keep_larger_local_mem_after_resize() { keep_larger_local_mem_after_resize(false); diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp index d3faaf07..9899afe4 100644 --- a/src/cuda/api/launch_config_builder.hpp +++ b/src/cuda/api/launch_config_builder.hpp @@ -179,6 +179,8 @@ class launch_config_builder_t { } } + /// Use the information specified for the builder to figure out the grid and block + /// dimensions with which the kernel is to be launched grid::composite_dimensions_t get_composite_dimensions() const noexcept(false) { auto result = get_unvalidated_composite_dimensions(); @@ -189,6 +191,10 @@ class launch_config_builder_t { } public: + /// Use the information specified to the builder (and defaults for the unspecified + /// information) to finalize the construction of a kernel launch configuration, + /// which can then be passed along with the kernel to a kernel-launching function, + /// e.g. 
the standalone @ref kernel::launch or the stream command @ref stream_t::enqueue_t::kernel_launch launch_configuration_t build() const { auto result = launch_configuration_t{ get_composite_dimensions() }; @@ -392,6 +398,7 @@ class launch_config_builder_t { } + /// Set the dimensions for each block in the intended kernel launch grid launch_config_builder_t& block_dimensions( grid::block_dimension_t x, grid::block_dimension_t y = 1, @@ -400,8 +407,17 @@ class launch_config_builder_t { return block_dimensions(grid::block_dimensions_t{x, y, z}); } + /// Set the block in the intended kernel launch grid to be uni-dimensional + /// with a specified size launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); } + /** + * Set the intended kernel launch grid to have 1D blocks, of the maximum + * length possible given the information specified to the builder. + * + * @note This will fail if neither a kernel nor a device have been chosen + * for the launch. + */ launch_config_builder_t& use_maximum_linear_block() { grid::block_dimension_t max_size; @@ -424,6 +440,16 @@ class launch_config_builder_t { } #if CUDA_VERSION >= 12000 + /** + * Set the dimensions of multi-block clusters within the grid. + * + * @note There is only a small number of possible dimension combinations of clusters; + * and this function does _not_ guarantee to fail immediately if you specify an + * invalid such combination. + * + * @note This setting does not affect the overall dimensions of the grid in terms of + * blocks. + */ launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims) { #ifndef NDEBUG @@ -434,6 +460,9 @@ class launch_config_builder_t { } #endif + /// Set the dimension of the grid for the intended kernel launch, in terms + /// of blocks + ///@{ launch_config_builder_t& grid_dimensions(grid::dimensions_t dims) { #ifndef NDEBUG @@ -447,6 +476,7 @@ class launch_config_builder_t { return *this; } + ///@} launch_config_builder_t& grid_dimensions( grid::dimension_t x, grid::dimension_t y = 1, @@ -455,9 +485,17 @@ class launch_config_builder_t { return grid_dimensions(grid::dimensions_t{x, y, z}); } + /// Set the grid for the intended launch to be one-dimensional, with a specified number + /// of blocks + ///@{ launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); } launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); } + ///@} + + /// Set the overall number of _threads_, in each dimension, of all blocks + /// in the grid of the intended kernel launch + ///@{ launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims) { #ifndef NDEBUG @@ -474,16 +512,30 @@ class launch_config_builder_t { { return overall_dimensions(grid::overall_dimensions_t{x, y, z}); } + ///@} + /// Set the intended launch grid to be linear, with a specified overall number of _threads_ + /// over all (1D) blocks in the grid launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); } + /** + * Set whether or blocks may synchronize with each other or not + * + * @note recall that even "non-cooperative" blocks can still access the same global memory + * locations, and can use atomic operations on such locations for (slow) synchronization. 
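To make the builder flow above concrete, here is a minimal usage sketch (not part of the patch; the device index, block size and thread count are placeholders, and it assumes the builder derives the grid dimensions when given both a block size and an overall size):

	auto device = cuda::device::get(0);
	auto config = cuda::launch_config_builder()
		.device(device)           // no kernel attached here; see .kernel() further below
		.block_size(256)          // one-dimensional blocks of 256 threads each
		.overall_size(1000000)    // total number of threads; the grid dimensions are derived
		.build();                 // yields a launch_configuration_t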
+	 */
	launch_config_builder_t& block_cooperation(bool cooperation)
	{
		thread_block_cooperation = cooperation;
		return *this;
	}

+	/// Allow kernel thread blocks to synchronize with each other, rather than guaranteeing
+	/// that they act independently (atomic global memory operations notwithstanding)
	launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); }
+
+	/// Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will
+	/// work entirely independently (atomic global memory operations notwithstanding)
	launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); }

	launch_config_builder_t& dynamic_shared_memory_size(
@@ -493,11 +545,18 @@ class launch_config_builder_t {
		return *this;
	}

+	/// Indicate that the intended launch should not allocate any shared
+	/// memory for the kernel to use beyond the static amount necessitated
+	/// by its (compiled) code.
	launch_config_builder_t& no_dynamic_shared_memory()
	{
		return dynamic_shared_memory_size(memory::shared::size_t(0));
	}

+	/// Indicate that the intended launch should allocate a certain amount of shared
+	/// memory for the kernel to use beyond the static amount necessitated
+	/// by its (compiled) code.
+	///@{
	launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
	{
#ifndef NDEBUG
@@ -512,13 +571,32 @@ class launch_config_builder_t {
	{
		return dynamic_shared_memory_size(size);
	}
+	///@}

+	/**
+	 * Indicate that the intended launch should allocate additional shared
+	 * memory for the kernel to use beyond the static amount necessitated
+	 * by its (compiled) code - with the amount to be determined based on
+	 * the block size.
+	 *
+	 * @param shared_mem_size_determiner a function determining the dynamic
+	 * shared memory size given the kernel launch block size
+	 */
	launch_config_builder_t& dynamic_shared_memory(
		kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
	{
		return dynamic_shared_memory_size(shared_mem_size_determiner);
	}

+	/**
+	 * Indicate that the specified wrapped kernel will be the one launched
+	 * with the configuration to be produced by this object. Such an indication
+	 * provides this object with information about the device and context in
+	 * which the kernel is to be launched, and ranges of possible values for
+	 * certain parameters (e.g. shared memory size, dimensions).
+	 *
+	 * @note Calling this method obviates a call to the @ref device() method.
+	 */
	launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
	{
		if (device_ and kernel_->device_id() != device_.value()) {
@@ -533,6 +611,15 @@ class launch_config_builder_t {
		return *this;
	}

+	/**
+	 * Indicate that the intended kernel launch would occur on (some stream in
+	 * some context on) the specified device. Such an indication provides this
+	 * object with some information regarding ranges of possible values for
+	 * certain parameters (e.g. shared memory size, dimensions).
+	 *
+	 * @note Do not call both this and the @ref kernel() method; prefer just that one.
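A sketch of the kernel-attached flow and the size-determiner overload documented above (illustrative only: `my_kernel_wrapper` stands for a cuda::kernel_t obtained elsewhere, e.g. from a module, and the determiner's signature is assumed to be "dynamic shared memory bytes as a function of the block size"):

	size_t shmem_for_block(int block_size) { return block_size * sizeof(float); }

	auto config = cuda::launch_config_builder()
		.kernel(&my_kernel_wrapper)              // also fixes the device and context for the launch
		.block_size(128)
		.num_blocks(80)
		.dynamic_shared_memory(shmem_for_block)  // evaluated for the block size actually used
		.build();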
+ */ + ///@{ launch_config_builder_t& device(const device::id_t device_id) { if (kernel_ and kernel_->device_id() != device_id) { @@ -548,7 +635,11 @@ class launch_config_builder_t { { return this->device(device.id()); } + ///@} + /// Clear the association with a specific kernel (which may have been + /// set using the @ref kernel method) + ///@{ launch_config_builder_t& kernel_independent() { kernel_ = nullptr; @@ -559,13 +650,14 @@ class launch_config_builder_t { kernel_ = nullptr; return *this; } + ///@} /** - * @brief THis will use information about the kernel, the already-set block size, + * @brief This will use information about the kernel, the already-set block size, * and the device to create a unidimensional grid of blocks to exactly saturate * the CUDA device's capacity for simultaneous active blocks. * - * @note This will _not_ set the block size - unlike + * @note This will _not_ set the block size - unlike {@ref min_params_for_max_occupancy()}. */ launch_config_builder_t& saturate_with_active_blocks() { @@ -584,6 +676,14 @@ class launch_config_builder_t { return *this; } + /** + * @brief This will use information about the kernel and the device to define + * a minimum launch grid which should guarantee maximum occupancy of the GPU's + * multiprocessors. + * + * @note A builder after this call _will_ set the block dimensions - unlike + * {@ref saturate_with_active_blocks()} . + */ launch_config_builder_t& min_params_for_max_occupancy() { if (not (kernel_)) { @@ -600,6 +700,7 @@ class launch_config_builder_t { } }; // launch_config_builder_t +/// A slightly shorter-named construction idiom for @ref launch_config_builder_t inline launch_config_builder_t launch_config_builder() { return {}; } } // namespace cuda diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp index f891f8e0..e213e7ed 100644 --- a/src/cuda/api/launch_configuration.hpp +++ b/src/cuda/api/launch_configuration.hpp @@ -58,7 +58,16 @@ enum class cluster_scheduling_policy_t { }; #endif +/** + * The full set of possible configuration parameters for launching a kernel on a GPU. + * + * @note Consider using a @ref launch_configuration_builder_t to incrementally construct + * these structs. + * + * @note this structure must be constructed with at least the grid and block dimensions. + */ struct launch_configuration_t { + /// Dimensions of the launch grid in blocks, and of the individual blocks in the grid. grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } }; /** @@ -160,6 +169,20 @@ struct launch_configuration_t { constexpr launch_configuration_t(const launch_configuration_t&) = default; constexpr launch_configuration_t(launch_configuration_t&&) = default; + /** + * Constructors corresponding to the CUDA runtime API's triple-chevron launch + * syntax: + * + * my_kernel <<< grid_Dims, block_dims, dynamic_shmem_size, my_stream >>> ( + * arg1, arg2, arg3, etc); + * + * ... where the specified aspects of the launch configuration are the dimensions + * and the dynamic shared memory size. + * + * @note The choices of stream and kernel function are _not_ part of the launch + * configuration. 
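The two occupancy-oriented builder methods above differ in what they decide for you; a brief sketch, again with a placeholder kernel wrapper:

	// Grid sized to saturate the device with active blocks; the block size must
	// already have been set, since saturate_with_active_blocks() does not change it:
	auto config_a = cuda::launch_config_builder()
		.kernel(&my_kernel_wrapper).block_size(256)
		.saturate_with_active_blocks().build();

	// Minimal grid for maximum occupancy; this variant chooses the block dimensions itself:
	auto config_b = cuda::launch_config_builder()
		.kernel(&my_kernel_wrapper)
		.min_params_for_max_occupancy().build();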
+ */ + ///@{ constexpr launch_configuration_t( grid::composite_dimensions_t grid_and_block_dimensions, memory::shared::size_t dynamic_shared_mem = 0u @@ -184,12 +207,14 @@ struct launch_configuration_t { grid::block_dimensions_t(block_dims), dynamic_shared_mem) { } + ///@} CPP14_CONSTEXPR launch_configuration_t& operator=(const launch_configuration_t& other) = default; CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) = default; }; #if __cplusplus < 202002L +///@cond constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept { return @@ -210,6 +235,7 @@ constexpr bool operator!=(const launch_configuration_t lhs, const launch_configu { return not (lhs == rhs); } +///@endcond #endif namespace detail_ { diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp index ef55571e..bd028477 100644 --- a/src/cuda/api/link.hpp +++ b/src/cuda/api/link.hpp @@ -20,14 +20,14 @@ namespace cuda { ///@cond class device_t; - class module_t; - class link_t; ///@endcond +/// Definitions related to CUDA linking-processes, captured by the @ref link_t wrapper class namespace link { +/// Kinds of images which can be used by the linker (some may require driver compilation work) enum class input_kind_t { cubin, /// Compiled device-class-specific device code ptx, /// PTX (microarchitecture-inspecific intermediate representation) @@ -36,9 +36,14 @@ enum class input_kind_t { library, /// An archive of objects files with embedded device code; a `.a` file }; +/// A raw CUDA driver handle for a linking-process using handle_t = CUlinkState; -// TODO: Check if the linking has been completed! +/** + * @brief Wrap an existing CUDA link-process in a @ref link_t wrapper class instance. + * + * @todo : Consider checking if the linking has already been completed! + */ inline link_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -48,21 +53,19 @@ inline link_t wrap( inline link_t create(const void *image, const link::options_t &options); -// TODO: Use a clase-class with C++17 of later, made up of the two classes here +/// Definitions relating to inputs to CUDA linking-processes namespace input { -/** - * A typed, named, image in memory which can be used as an input to a runtime - * CUDA linking process. - */ +/// A typed, named, image in memory which can be used as an input to a runtime CUDA linking-process struct image_t : memory::region_t { - const char *name; - link::input_kind_t type; + const char *name; /// Link images are attached a name when registered in a linking-process + link::input_kind_t type; /// type of contents found in the memory region }; +/// A typed, named, image in a file which can be used as an input to a runtime CUDA linking-process struct file_t { - const char *path; // TODO: Use a proper path in C++14 and later - link::input_kind_t type; + const char *path; + link::input_kind_t type; /// type of contents found in the file }; } // namespace input @@ -115,7 +118,16 @@ class link_t { return memory::region_t{cubin_output_start, cubin_output_size}; } - // TODO: Replace this with methods which take wrapper classes. + /** + * Add another linkable image, from memory, to this linking-process + * + * @param[in] image Memory region containing the image + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. 
+ */ void add(link::input::image_t image, const link::options_t &ptx_compilation_options = {}) const { auto marshalled_options = link::detail_::marshal(ptx_compilation_options); @@ -134,6 +146,17 @@ class link_t { + ::std::to_string(static_cast(image.type)) + " to a link."); } + /** + * Add another linkable image, from a file, to this linking-process + * + * @param[in] file_input Path of the image file to be added + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. + */ + ///@{ void add_file(link::input::file_t file_input, const link::options_t &options) const { auto marshalled_options = link::detail_::marshal(options); @@ -156,6 +179,7 @@ class link_t { return add_file(path.c_str(), file_contents_type); } #endif + ///@} protected: // constructors @@ -219,6 +243,7 @@ class link_t { namespace link { +/// Create a new link-process (before adding any compiled images or or image-files) inline link_t create(const link::options_t &options = link::options_t{}) { handle_t new_link_handle; @@ -241,13 +266,12 @@ inline link_t create(const link::options_t &options = link::options_t{}) do_take_ownership); } -// TODO: Check if the linking has been completed! inline link_t wrap( - device::id_t device_id, - context::handle_t context_handle, - link::handle_t handle, - const link::options_t &options, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + link::handle_t handle, + const link::options_t & options, + bool take_ownership) noexcept { return link_t{device_id, context_handle, handle, options, take_ownership}; } diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp index a2e17d59..52d98206 100644 --- a/src/cuda/api/link_options.hpp +++ b/src/cuda/api/link_options.hpp @@ -1,7 +1,8 @@ /** * @file * - * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + * @brief Definitions and utility functions relating to just-in-time compilation, assembly + * and linking of CUDA code. */ #pragma once #ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ @@ -21,17 +22,37 @@ class module_t; namespace link { +/// Possible strategies for obtaining fully-compiled binary code for a target device +/// when it is not immediately available. enum fallback_strategy_for_binary_code_t { + /// Prefer compiling available PTX code to produce fully-compiled binary code prefer_compiling_ptx = 0, + /// Prefer using existing fully-compiled (binary) code, for a compatible but + /// not identical target device prefer_using_compatible_binary = 1, }; namespace detail_ { +/// The CUDA driver's raw generic JIT-related option type using option_t = CUjit_option; +/** + * Mechanism for finalizing options into a format readily usable by the + * link_t wrapper (and by the `cuLink`- functions - but highly inconvenient + * for inspection and modification. + * + * @note Don't create these yourself unless you have to; use @ref options_t + * instead, and @ref options_t::marshal() when done, for completing the + * linking-process. If you must create them - use `push_back()` method + * repeatedly until done with all options. 
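Putting the linking-process pieces documented above together, a sketch (the memory region, registration name and file path are placeholders; default options are used, and retrieving the linked cubin via the completion method is not shown in this hunk):

	auto link = cuda::link::create();
	cuda::link::input::image_t ptx_input {
		ptx_region,                       // a memory::region_t holding PTX text
		"my_translation_unit",            // the name under which this input is registered
		cuda::link::input_kind_t::ptx };  // aggregate-initializing the base region needs C++17
	link.add(ptx_input);                  // the driver may compile the PTX at this point
	link.add_file({ "more_kernels.cubin", cuda::link::input_kind_t::cubin }, cuda::link::options_t{});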
+ */ struct marshalled_options_t { + /// The CUDA driver's expected type for number of link-related options using size_type = unsigned; + + /// The CUDA driver's enum for option identification has this many values - + /// and thus, there is need for no more than this many marshalled options constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; protected: @@ -39,8 +60,6 @@ struct marshalled_options_t { ::std::array value_buffer; size_type count_ { 0 }; public: - size_type count() const { return count_; } - void push_back(option_t option) { if (count_ >= max_num_options) { @@ -76,7 +95,12 @@ struct marshalled_options_t { } public: - + /** + * This method (alone) is used to populate this structure. + * + * @note The class is not a standard container, and this method cannot be + * reversed or undone, i.e. there is no `pop_back()` or `pop()`. + */ template void push_back(option_t option, T value) { @@ -85,25 +109,46 @@ struct marshalled_options_t { // Now set value_buffer[count-1]... value_buffer[count_-1] = process_value(value); } + + /// These three methods yield what the CUDA driver actually expects: + /// Two matching raw buffers and their count of elements + ///@{ const option_t* options() const { return option_buffer.data(); } const void * const * values() const { return value_buffer.data(); } + size_type count() const { return count_; } + ///@} }; } // namespace detail_ +/** + * A convenience class for holding, setting and inspecting options for a CUDA binary code + * linking process - which may also involve PTX compilation. + * + * @note This structure does not let you set those options which the CUDA driver documentation + * describes as having internal purposes only. + */ struct options_t final : public rtc::common_ptx_compilation_options_t { + /// options related to logging the link-process struct { + /// Non-error information regarding the logging process (i.e. its "standard output" stream) optional> info; + + /// Information regarding errors in the logging process (i.e. its "standard error" stream) optional> error; + + /// Control whether the info and error logging will be verbose bool verbose; } logs; - // Note: When this is true, the specific_target of the base class - // is overridden + /// Instead of using explicitly-specified binary target, from + /// @ref common_ptx_compilation_options_t::specific_target - use the device of the current CUDA + /// context as the target for binary generation bool obtain_target_from_cuda_context { true }; - /// fallback behavior if a (matching cubin???) is not found + /// Possible strategy for obtaining fully-compiled binary code when it is not + /// simply available in the input to the link-process optional fallback_strategy_for_binary_code; // Ignoring the "internal purposes only" options; @@ -118,6 +163,8 @@ struct options_t final : public rtc::common_ptx_compilation_options_t { namespace detail_ { +/// Construct a easily-driver-usable link-process options structure from +/// a more user-friendly `options_t` structure. inline marshalled_options_t marshal(const options_t& link_options) { marshalled_options_t marshalled{}; diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index ce72ecae..24f286a5 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -85,7 +85,7 @@ enum cpu_write_combining : bool { }; /** - * @brief options accepted by CUDA's allocator of memory with a host-side aspect + * options accepted by CUDA's allocator of memory with a host-side aspect * (host-only or managed memory). 
*/ struct allocation_options { @@ -106,6 +106,7 @@ inline unsigned make_cuda_host_alloc_flags(allocation_options options) /** * @namespace mapped + * * Memory regions appearing in both on the host-side and device-side address * spaces with the regions in both spaces mapped to each other (i.e. guaranteed * to have the same contents on access up to synchronization details). Consult the @@ -124,7 +125,7 @@ struct span_pair_t { }; /** - * @brief A pair of memory regions, one in system (=host) memory and one on a + * A pair of memory regions, one in system (=host) memory and one on a * CUDA device's memory - mapped to each other * * @note this is the mapped-pair equivalent of a `void *`; it is not a @@ -143,9 +144,7 @@ struct region_pair_t { } // namespace mapped -/** - * @brief CUDA-Device-global memory on a single device (not accessible from the host) - */ +///CUDA-Device-global memory on a single device (not accessible from the host) namespace device { namespace detail_ { @@ -325,7 +324,7 @@ struct deleter { /** - * @brief Sets consecutive elements of a region of memory to a fixed + * Sets consecutive elements of a region of memory to a fixed * value of some width * * @note A generalization of `set()`, for different-size units. @@ -339,7 +338,7 @@ template void typed_set(T* start, const T& value, size_t num_elements); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset for CUDA device-side memory * @@ -366,7 +365,7 @@ inline void set(region_t region, int byte_value) ///@} /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -389,7 +388,7 @@ inline void zero(region_t region) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a value of a certain type, in a CUDA device's * global memory @@ -558,7 +557,7 @@ inline void copy(region_t destination, void* source) ///@} /** - * @brief Sets a number of bytes in memory to a fixed value + * Sets a number of bytes in memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -571,7 +570,7 @@ inline void copy(region_t destination, void* source) void set(void* ptr, int byte_value, size_t num_bytes); /** - * @brief Sets all bytes in a region of memory to a fixed value + * Sets all bytes in a region of memory to a fixed value * * @note The equivalent of @ref ::std::memset - for any and all CUDA-related * memory spaces @@ -586,7 +585,7 @@ inline void set(region_t region, int byte_value) } /** - * @brief Sets all bytes in a region of memory to 0 (zero) + * Sets all bytes in a region of memory to 0 (zero) */ ///@{ /** @@ -610,7 +609,7 @@ inline void zero(void* ptr, size_t num_bytes) ///@} /** - * @brief Sets all bytes of a single pointed-to value to 0 + * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a single element of a certain type, which may * be in host-side memory, global CUDA-device-side memory or CUDA-managed @@ -662,7 +661,7 @@ status_t multidim_copy(copy_parameters_t params) } // namespace detail_ /** - * @brief An almost-generalized-case memory copy, taking a rather complex structure of + * An almost-generalized-case memory copy, taking a rather complex structure of * copy parameters - wrapping the CUDA driver's own most-generalized-case copy * * @tparam 
NumDimensions The number of dimensions of the parameter structure. @@ -1216,8 +1215,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han /** - * @brief Sets consecutive elements of a region of memory to a fixed - * value of some width + * Sets consecutive elements of a region of memory to a fixed value of some width * * @note A generalization of `async::set()`, for different-size units. * @@ -1263,7 +1261,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& s void zero(void* start, size_t num_bytes, const stream_t& stream); /** - * @brief Asynchronously sets all bytes of a single pointed-to value + * Asynchronously sets all bytes of a single pointed-to value * to 0 (zero). * * @note asynchronous version of @ref memory::zero(T*) @@ -1440,6 +1438,7 @@ inline void copy( /** * @namespace host + * * Host-side (= system) memory which is "pinned", i.e. resides in * a fixed physical location - and allocated by the CUDA driver. */ @@ -1459,7 +1458,7 @@ inline region_t allocate( /** - * allocate pinned host memory + * Allocates pinned host memory * * @note "pinned" memory is allocated in contiguous physical ram * addresses, making it possible to copy to and from it to the the @@ -1469,25 +1468,17 @@ inline region_t allocate( * * @throws cuda::runtime_error if allocation fails for any reason * - * @param context * @param size_in_bytes the amount of memory to allocate, in bytes * @param options * options to pass to the cuda host-side memory allocator; see * {@ref memory::allocation_options}. * @return a pointer to the allocated stretch of memory */ -///@{ - -inline region_t allocate( - const context_t& context, - size_t size_in_bytes, - allocation_options options); - -region_t allocate( - size_t size_in_bytes, - allocation_options options); +region_t allocate(size_t size_in_bytes, allocation_options options); /** + * @copydoc allocate(size_t, allocation_options) + * * @param portability * whether or not the allocated region can be used in different * CUDA contexts. @@ -1504,16 +1495,18 @@ inline region_t allocate( return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } ); } +/// @copydoc allocate(size_t, portability_across_contexts, cpu_write_combining) inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc) { return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} ); } -///@} - /** - * Free a region of pinned host memory which was allocated with one of the pinned host + * Frees a region of pinned host memory which was allocated with one of the pinned host * memory allocation functions. + * + * @note The address provided must be the _beginning_ of the region of allocated memory; + * and the entire region is freed (i.e. the region size is known to/determined by the driver) */ inline void free(void* host_ptr) { @@ -1526,6 +1519,11 @@ inline void free(void* host_ptr) throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr)); } +/** + * @copybrief free(void*) + * + * @param region The region of memory to free + */ inline void free(region_t region) { return free(region.data()); } namespace detail_ { @@ -1537,9 +1535,8 @@ struct deleter { void operator()(void* ptr) const { cuda::memory::host::free(ptr); } }; - /** - * @brief Makes a preallocated memory region behave as though it were allocated with @ref host::allocate. 
+ * Makes a pre-allocated memory region behave as though it were allocated with @ref host::allocate. * * Page-locks the memory range specified by ptr and size and maps it for the device(s) as specified by * flags. This memory range also is added to the same tracking mechanism as cuMemAllocHost() to @@ -1669,7 +1666,7 @@ inline void deregister(const_region_t region) } /** - * @brief Sets all bytes in a stretch of host-side memory to a single value + * Sets all bytes in a stretch of host-side memory to a single value * * @note a wrapper for @ref ::std::memset * @@ -1703,22 +1700,6 @@ inline void zero(T* ptr) } // namespace host -/** - * This type of memory, also known as _unified_ memory, appears within - * a unified, all-system address space - and is used with the same - * address range on the host and on all relevant CUDA devices on a - * system. It is paged, so that it may exceed the physical size of - * a CUDA device's global memory. The CUDA driver takes care of - * "swapping" pages "out" from a device to host memory or "swapping" - * them back "in", as well as of propagation of changes between - * devices and host-memory. - * - * @note For more details, see - * - * Unified Memory for CUDA Beginners on the - * Parallel4All blog. - * - */ namespace managed { namespace detail_ { @@ -1899,7 +1880,7 @@ inline region_t allocate( } // namespace detail_ /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices. * * @param context the initial context which is likely to access the managed @@ -1917,7 +1898,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on CUDA devices * * @param device the initial device which is likely to access the managed @@ -1935,7 +1916,7 @@ inline region_t allocate( initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); /** - * @brief Allocate a a region of managed memory, accessible with the same + * Allocate a a region of managed memory, accessible with the same * address on the host and on all CUDA devices. * * @note While the allocated memory should be available universally, the @@ -2007,7 +1988,7 @@ inline void prefetch( } // namespace detail_ /** - * @brief Prefetches a region of managed memory to a specific device, so + * Prefetches a region of managed memory to a specific device, so * it can later be used there without waiting for I/O from the host or other * devices. */ @@ -2017,7 +1998,7 @@ void prefetch( const stream_t& stream); /** - * @brief Prefetches a region of managed memory into host memory. It can + * Prefetches a region of managed memory into host memory. It can * later be used there without waiting for I/O from any of the CUDA devices. */ void prefetch_to_host( @@ -2194,6 +2175,7 @@ inline bool is_part_of_a_region_pair(const void* ptr) namespace device { +/// A unique span of device-global memory template using unique_span = cuda::unique_span; @@ -2212,25 +2194,39 @@ unique_span make_unique_span(const context::handle_t context_handle, size_t s } // namespace detail_ /** - * @brief Create a variant of ::std::unique_pointer for an array in - * device-global memory. + * Allocate memory for a consecutive sequence of typed elements in device-global memory. 
* - * @note CUDA's runtime API always has a current device; but - - * there is not necessary a current context; so a primary context - * for a device may be created through this call. + * @tparam T type of the individual elements in the allocated sequence * - * @tparam T an array type; _not_ the type of individual elements + * @param context The CUDA device context in which to make the allocation. + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said * - * @param context The CUDA device context in which to make the - * allocation. - * @param num_elements the number of elements to allocate + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. * - * @return an ::std::unique_ptr pointing to the constructed T array -*/ + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user beware. + */ template unique_span make_unique_span(const context_t& context, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span(const device_t& device, size_t size); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span(size_t size); @@ -2243,7 +2239,7 @@ inline device::unique_span make_unique_span(const context_t& context, size_t return device::make_unique_span(context, num_elements); } -/// See @ref `device::make_unique_span(const device_t& device, size_t num_elements)` +/// See @ref `device::make_unique_span(const context_t& context, size_t num_elements)` template inline device::unique_span make_unique_span(const device_t& device, size_t num_elements) { @@ -2252,9 +2248,29 @@ inline device::unique_span make_unique_span(const device_t& device, size_t nu namespace host { +/// A unique span of CUDA-driver-allocated, pinned host (=system) memory template using unique_span = cuda::unique_span; +/** + * Allocate memory for a consecutive sequence of typed elements in system + * (host-side) memory. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware, especially since this is host-side memory. 
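A short sketch of the typed-allocation helpers documented above (the element type, sizes and device index are placeholders):

	auto device    = cuda::device::get(0);
	auto on_host   = cuda::memory::host::make_unique_span<float>(1024);            // pinned host memory
	auto on_device = cuda::memory::device::make_unique_span<float>(device, 1024);  // device-global memory
	// ... fill on_host ...; both spans carry their sizes, but a raw-pointer copy is used here for brevity
	cuda::memory::copy(on_device.data(), on_host.data(), on_host.size() * sizeof(float));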
+ */ template unique_span make_unique_span(size_t size) { @@ -2265,6 +2281,7 @@ unique_span make_unique_span(size_t size) namespace managed { +/// A unique span of CUDA-driver-allocated managed memory template using unique_span = cuda::unique_span; @@ -2282,16 +2299,48 @@ unique_span make_unique_span( } // namespace detail_ +/** + * Allocate memory for a consecutive sequence of typed elements in system + * (host-side) memory. + * + * @tparam T type of the individual elements in the allocated sequence + * + * @param size the number of elements to allocate + * @return A @ref unique_span which owns the allocated memory (and will release said + * memory upon destruction) + * + * @note This function is somewhat similar to ::std:: make_unique_for_overwrite(), except + * that the returned value is not "just" a unique pointer, but also has a size. It is also + * similar to {@ref cuda::device::make_unique_region}, except that the allocation is + * conceived as typed elements. + * + * @note Typically, this is used for trivially-constructible elements, for which reason the + * non-construction of individual elements should not pose a problem. But - let the user + * beware, especially since this is host-side memory. + */ template unique_span make_unique_span( const context_t& context, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @param device The CUDA device in whose primary context to make the allocation. + */ template unique_span make_unique_span( const device_t& device, size_t size, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_span(const context_t&, size_t) + * + * @note The current device's primary context will be used (_not_ the + * current context). + */ template unique_span make_unique_span( size_t size, @@ -2302,6 +2351,7 @@ unique_span make_unique_span( } // namespace memory namespace symbol { + /** * Locates a CUDA symbol in global or constant device memory * diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index 6d04f547..1913ac0d 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -30,6 +30,13 @@ inline void initialize_driver() throw_if_error_lazy(status, "Failed initializing the CUDA driver"); } +/** + * A mechanism for ensuring a @ref cuInit() call has been made, to use before making + * any other driver API calls. + * + * @note differs from simply calling `initialize_driver()` in that repeated calls + * from the same thread will avoid additional @ref cuInit() call. + */ inline void ensure_driver_is_initialized() { thread_local bool driver_known_to_be_initialized{false}; diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp index f69315e4..b06d694c 100644 --- a/src/cuda/api/module.hpp +++ b/src/cuda/api/module.hpp @@ -30,10 +30,13 @@ class kernel_t; namespace module { +// The CUDA driver's raw handle for modules using handle_t = CUmodule; namespace detail_ { +/// Construct a module proxy object - for an existing module - from the class' +/// constituent fields inline module_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -70,6 +73,8 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i * be loaded (and in which the module contents may be used) * @param[in] module_data the opaque, raw binary data for the module - in a contiguous container * such as a span, a cuda::unique_span etc.. 
+ * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ ///@{ template = 201703L +/// @copydoc load_from_file(device_t, const char*) inline module_t load_from_file( const device_t& device, const ::std::filesystem::path& path) @@ -298,6 +337,7 @@ inline module_t load_from_file( return load_from_file(device, path.c_str()); } +/// @copydoc load_from_file(const char*) inline module_t load_from_file( const ::std::filesystem::path& path) { @@ -320,23 +360,19 @@ inline module_t wrap( return module_t{device_id, context_handle, module_handle, take_ownership, hold_pc_refcount_unit}; } -/* -template -module_t create(const context_t& context, const void* module_data, Creator creator_function); -*/ - /** * Creates a new module in a context using raw compiled code * * @param context The module will exist within this GPU context, i.e. the globals (functions, * variable) of the module would be usable within that constant. * @param module_data The raw compiled code for the module. - * @param link_options Potential options for the PTX compilation and device linking of the code. + * @param link_options Potential options for the PTX compilation and linking of the compiled + * device-side code. */ -///@{ module_t create(const context_t& context, const void* module_data, const link::options_t& link_options); + +/// @copydoc create(const context_t&, const void*, const link::options_t&) module_t create(const context_t& context, const void* module_data); -///@} inline void destroy(handle_t handle, context::handle_t context_handle, device::id_t device_id) { @@ -362,7 +398,17 @@ inline device::primary_context_t get_context_for(device_t& locus); } // namespace detail_ -// Note: The following may create the primary context of a device! +/** + * Create a new module - in a specified context or in a device's primary context, + * using raw module data in memory. + * + * @tparam Locus Either a @ref cuda::device_t or a {@ref cuda::context_t}. + * @tparam ContiguousContainer A span, a vector, a unique_span, or similar type + * @param locus Where the new module should be created + * @param module_data The raw data for the module in locus-accessible memory. + * + * @note This function may create/allocate resources for the primary context of a device! + */ template ::value, bool>> module_t create( @@ -373,6 +419,14 @@ module_t create( return detail_::create(context, module_data.data()); } +/** + * @copydoc create(Locus&&, ContiguousContainer) + * + * @param link_options Options for PTX compilation and for linking the module data, + * eventually. + * + * @return + */ // Note: The following may create the primary context of a device! template ::value, bool>> diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp index 34fb9c5f..3416def8 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_region.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -72,6 +72,14 @@ inline unique_region make_unique_region(size_t num_bytes) namespace managed { +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. 
+ * + * @param context A context of possible single-device-visibility + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, @@ -81,6 +89,14 @@ inline unique_region make_unique_region( return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; } +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. + * + * @param context A context of possible single-device-visibility + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, diff --git a/src/cuda/api/multi_wrapper_impls/unique_span.hpp b/src/cuda/api/multi_wrapper_impls/unique_span.hpp index c2907c63..0c0ee18b 100644 --- a/src/cuda/api/multi_wrapper_impls/unique_span.hpp +++ b/src/cuda/api/multi_wrapper_impls/unique_span.hpp @@ -28,7 +28,7 @@ unique_span make_unique_span(const context_t& context, size_t num_elements) } /** - * @brief Create a variant of ::std::unique_pointer for an array in + * @brief Allocate (but do) * device-global memory * * @tparam T an array type; _not_ the type of individual elements diff --git a/src/cuda/api/pci_id.hpp b/src/cuda/api/pci_id.hpp index 8c8e56ae..9518e941 100644 --- a/src/cuda/api/pci_id.hpp +++ b/src/cuda/api/pci_id.hpp @@ -53,6 +53,8 @@ struct pci_location_t { * and any of them can be used. */ static pci_location_t parse(const ::std::string& id_str); + + /// @copydoc parse(const ::std::string& id_str) static pci_location_t parse(const char* id_str); }; diff --git a/src/cuda/api/peer_to_peer.hpp b/src/cuda/api/peer_to_peer.hpp index 68c718b1..cc7c3f93 100644 --- a/src/cuda/api/peer_to_peer.hpp +++ b/src/cuda/api/peer_to_peer.hpp @@ -14,17 +14,31 @@ namespace cuda { namespace device { +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among peer CUDA GPU devices + * on the same system. 
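For example, the parse() overloads documented above might be used as follows (the bus-ID string is a placeholder; an invalid string is expected to result in an exception rather than a partially-filled structure):

	auto location = cuda::device::pci_location_t::parse("0000:3b:00.0");
	// the overload taking a ::std::string behaves identically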
+ */ namespace peer_to_peer { -// Aliases for all CUDA device attributes +/// Aliases for CUDA driver GPU attribute codes +///@{ + +/// A relative value indicating the performance of the link between two devices +constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; + +/// 1 if access is supported, 0 otherwise +constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; + +/// 1 if the first device can perform native atomic operations on the second device, 0 otherwise +constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; -constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; /// A relative value indicating the performance of the link between two devices -constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; /// 1 if access is supported, 0 otherwise -constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; /// 1 if the first device can perform native atomic operations on the second device, 0 otherwise #if CUDA_VERSION >= 10000 -constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array iterpolatory access operations are supported across the link, 0 otherwise +/// 1 if special array interpolatory access operations are supported across the link, 0 otherwise +constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; #endif - +///@} namespace detail_ { /** @@ -94,6 +108,12 @@ void disable_access_to(const context_t &peer_context); } // namespace current +/** + * @namespace peer_to_peer + * + * API functions and definitions relating to communications among "peer" contexts on + * the same system, which may possibly regard different CUDA devices. + */ namespace peer_to_peer { namespace detail_ { diff --git a/src/cuda/api/pointer.hpp b/src/cuda/api/pointer.hpp index 2e573b72..1688b8d3 100644 --- a/src/cuda/api/pointer.hpp +++ b/src/cuda/api/pointer.hpp @@ -120,7 +120,7 @@ inline memory::type_t type_of(const void* ptr) /// Obtain (a non-owning wrapper for) the CUDA context with which a memory address is associated /// (e.g. being the result of an allocation or mapping in that context) -inline context_t context_of(const void* ptr); +context_t context_of(void const* ptr); /** * A convenience wrapper around a raw pointer "known" to the CUDA runtime diff --git a/src/cuda/api/primary_context.hpp b/src/cuda/api/primary_context.hpp index db59acc5..4bbaff92 100644 --- a/src/cuda/api/primary_context.hpp +++ b/src/cuda/api/primary_context.hpp @@ -75,6 +75,12 @@ inline void increase_refcount(device::id_t device_id) } // namespace detail_ +/** + * @returns true if the device's primary context is active (i.e. has resources allocated for it), + * which implies we are holding a refcount unit for it somewhere. + * + * @note recall a primary context being active does not mean that it is the _current_ context + */ inline bool is_active(const device_t& device); /** @@ -140,6 +146,9 @@ class primary_context_t : public context_t { public: + /// @return a stream object for the default-ID stream of the device, which + /// is pre-created and on which actions are scheduled when the runtime API + /// is used and no stream is specified. 
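A sketch tying together the primary-context queries and the default stream documented above (the primary_context() accessor on device_t is assumed here; it is not part of this hunk):

	auto device = cuda::device::get(0);
	bool active = cuda::device::primary_context::is_active(device);  // resources allocated yet?
	auto pc     = device.primary_context();   // assumed accessor; activates the primary context if needed
	auto stream = pc.default_stream();        // the default-ID ("null") stream described above
	stream.synchronize();                     // wait for work scheduled on that default stream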
stream_t default_stream() const noexcept; public: // friendship @@ -280,6 +289,7 @@ inline bool is_current(device::id_t device_id) } // namespace detail +/// @return true if the current context is its device's primary context inline bool is_current() { auto device_id = context::current::detail_::get_device_id(); diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp index e9896a9a..ab57b314 100644 --- a/src/cuda/api/stream.hpp +++ b/src/cuda/api/stream.hpp @@ -50,6 +50,11 @@ enum : bool { nonblocking = async, }; +/** + * Kinds of conditions to apply to a value in GPU global memory + * when waiting on that value, i.e. on what condition to stop + * waiting. + */ enum wait_condition_t : unsigned { greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ, geq = CU_STREAM_WAIT_VALUE_GEQ, @@ -173,6 +178,9 @@ void enqueue_function_call(const stream_t& stream, Function function, void * arg * the current context and outlasting it. When set to `true`, * the proxy class will act as it does usually, destroying the stream * when being destructed itself. + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist * @return an instance of the stream proxy class, with the specified * device-stream combination. */ @@ -255,6 +263,8 @@ class stream_t { return flags & CU_STREAM_NON_BLOCKING; } + /// @returns the execution priority of a tasks on this stream (relative to other + /// tasks in other streams on the same device stream::priority_t priority() const { int the_priority; @@ -321,8 +331,21 @@ class stream_t { const stream_t& associated_stream; public: + ///@cond enqueue_t(const stream_t& stream) : associated_stream(stream) {} + ///@nocond + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param parameters to arguments to be passed to the kernel for this launch + * + * @note This function is cognizant of the types of all arguments passed to it; + * for a type-erased version, see @ref type_erased_kernel_launch() + */ template void kernel_launch( const KernelFunction& kernel_function, @@ -336,6 +359,19 @@ class stream_t { ::std::forward(parameters)...); } + /** + * Schedule a kernel launch on the associated stream + * + * @param kernel A wrapper around the kernel to launch + * @param launch_configuration A description of how to launch the kernel (e.g. + * block and grid dimensions). + * @param marshalled_arguments Pointers to arguments to be passed to the kernel + * for this launch + * + * @note This signature does not require any type information regarding the kernel + * function type; see @ref kernel_launch() for a type-observing version of the + * same schedulign operation. + */ void type_erased_kernel_launch( const kernel_t& kernel, launch_configuration_t launch_configuration, @@ -345,21 +381,14 @@ class stream_t { } /** - * Have the CUDA device perform an I/O operation between two specified - * memory regions (on or off the actual device) + * Copy operations * + * The source and destination memory regions may be anywhere the CUDA driver can + * map (e.g. the device's global memory, host/system memory, the global memory of + * another device, constant memory etc.) */ - ///@{ - /** - * @param destination destination region into which to copy. 
May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param source destination region from which to copy. May be - * anywhere in which memory can be mapped to the device's memory space (e.g. - * the device's global memory, host memory or the global memory of another device) - * @param num_bytes size of the region to copy - **/ + /// Schedule a copy of one region of memory to another void copy(void *destination, const void *source, size_t num_bytes) const { // CUDA doesn't seem to need us to be in the stream's context to enqueue the copy; @@ -368,6 +397,7 @@ class stream_t { memory::async::detail_::copy(destination, source, num_bytes, associated_stream.handle_); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source, size_t num_bytes) const { #ifndef NDEBUG @@ -378,16 +408,23 @@ class stream_t { copy(destination, source.start(), num_bytes); } + /** + * @copybrief copy(void *, const void *, size_t) const + * + * @note @p num_bytes may be smaller than the sizes of any of the regions + */ void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const { copy(destination.start(), source, num_bytes); } + /// @copybrief copy(void *, const void *, size_t) const void copy(memory::region_t destination, memory::const_region_t source) const { copy(destination, source, source.size()); } + /// @copybrief copy(void *, const void *, size_t) const void copy(void* destination, memory::const_region_t source) const { copy(destination, source, source.size()); @@ -398,9 +435,9 @@ class stream_t { * Set all bytes of a certain region in device memory (or unified memory, * but using the CUDA device to do it) to a single fixed value. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param byte_value the value with which to fill the memory region bytes - * @param num_bytes size of the region to fill + * @param num_bytes size in bytes of the region to fill */ void memset(void *start, int byte_value, size_t num_bytes) const { @@ -409,6 +446,7 @@ class stream_t { memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_); } + /// @copydoc memset(void *, int, size_t) const void memset(memory::region_t region, int byte_value) const { memset(region.data(), byte_value, region.size()); @@ -422,7 +460,7 @@ class stream_t { * API call for setting to zero; does that mean there are special facilities * for zero'ing memory faster? Who knows. * - * @param destination Beginning of the region to fill + * @param start Beginning of the region to fill * @param num_bytes size of the region to fill */ void memzero(void *start, size_t num_bytes) const @@ -431,6 +469,9 @@ class stream_t { memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_); } + /** + * @copydoc memzero(void *, size_t) const + */ void memzero(memory::region_t region) const { memzero(region.data(), region.size()); @@ -492,6 +533,7 @@ class stream_t { } public: + /// Enqueues a host-invokable object, typically a function or closure object call. template void host_invokable(Invokable& invokable) const { @@ -529,13 +571,16 @@ class stream_t { memory::device::async::free(associated_stream, region); } #endif - ///@{ /** * Sets the attachment of a region of managed memory (i.e. 
in the address space visible * on all CUDA devices and the host) in one of several supported attachment modes. * - * The attachmentis actually a commitment vis-a-vis the CUDA driver and the GPU itself + * @param managed_region_start a pointer to the beginning of the managed memory region. + * This cannot be a pointer to anywhere in the middle of an allocated region - you must + * pass whatever @ref cuda::memory::managed::allocate() returned. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself * that it doesn't need to worry about accesses to this memory from devices other than * its object of attachment, so that the driver can optimize scheduling accordingly. * @@ -547,12 +592,6 @@ class stream_t { * the attachment goes into effect (some time after) previous scheduled actions have * concluded. */ - ///@{ - /** - * @param managed_region_start a pointer to the beginning of the managed memory region. - * This cannot be a pointer to anywhere in the middle of an allocated region - you must - * pass whatever @ref cuda::memory::managed::allocate() returned. - */ void attach_managed_region( const void* managed_region_start, memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const @@ -572,8 +611,23 @@ class stream_t { } /** - * @param region the managed memory region to attach; it cannot be a sub-region - - * you must pass whatever @ref cuda::memory::managed::allocate() returned. + * @copybrief attach_managed_region(const void*, memory::managed::attachment_t) const + * + * @param region the entire managed memory region; note this must not be a sub-region; + * you must pass whatever the CUDA memory allocation or construction code provided + * you with, in full. + * + * The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself + * that it doesn't need to worry about accesses to this memory from devices other than + * its object of attachment, so that the driver can optimize scheduling accordingly. + * + * @note by default, the memory region is attached to this specific stream on its + * specific device. In this case, the host will be allowed to read from this memory + * region whenever no kernels are pending on this stream. + * + * @note Attachment happens asynchronously, as an operation on this stream, i.e. + * the attachment goes into effect (some time after) previous scheduled actions have + * concluded. */ void attach_managed_region( memory::region_t region, @@ -581,8 +635,6 @@ class stream_t { { attach_managed_region(region.start(), attachment); } - ///@} - /** * Will pause all further activity on the stream until the specified event has @@ -862,16 +914,21 @@ class stream_t { // it must release its refcount unit on destruction public: // data members - which only exist in lieu of namespaces + + /// This data member is a gadget for use instead of a "class-local" namespace; + /// we do not need it as a distinct object const enqueue_t enqueue { *this }; // The use of *this here is safe, since enqueue_t doesn't do anything with it // on its own. Any use of enqueue only happens through, well, *this - and // after construction. 
}; +///@cond inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept { return not (lhs == rhs); } +///@nocond namespace stream { @@ -972,16 +1029,7 @@ void enqueue_function_call(const stream_t& stream, Function function, void* argu * for execution scheduling; lower numbers represent higher properties; * each device has a range of priorities, which can be obtained using * @ref device_t::stream_priority_range() - * @param hold_pc_refcount_unit when the event's context is a device's primary - * context, this controls whether that context must be kept active while the - * event continues to exist - * @return The newly-created stream - */ -///@{ - -/** - * @brief Create a new stream (= queue) in the primary execution context - * of a CUDA device. + * @return The newly-created stream */ stream_t create( const device_t& device, @@ -992,6 +1040,16 @@ stream_t create( * @brief Create a new stream (= queue) in a CUDA execution context. * * @param context the execution context in which to create the stream + * @param synchronizes_with_default_stream if true, no work on this stream + * will execute concurrently with work from the default stream (stream 0) + * @param priority priority of tasks on the stream, relative to other streams, + * for execution scheduling; lower numbers represent higher priorities; + * each device has a range of priorities, which can be obtained using + * @ref device_t::stream_priority_range() + * @param hold_pc_refcount_unit when the stream's context is a device's primary + * context, this controls whether that context must be kept active while the + * stream continues to exist + * @return The newly-created stream */ stream_t create( const context_t& context, @@ -1002,7 +1060,16 @@ stream_t create( } // namespace stream -inline void synchronize(const stream_t& stream) +/** + * Waits for all previously-scheduled tasks on a given stream to conclude, + * before returning. + * + * Depending on the `host_thread_sync_scheduling_policy_t` set for the + * specified stream, the thread calling this method will either yield, + * spin or block until all tasks previously scheduled on the + * stream have concluded. 
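+ *
+ * A minimal usage sketch (`device`, the kernel, its arguments and the launch
+ * configuration are all assumed to already exist; the names are hypothetical):
+ * @code
+ * auto stream = cuda::stream::create(device, cuda::stream::async);
+ * stream.enqueue.kernel_launch(my_kernel, my_launch_config, arg1, arg2);
+ * cuda::synchronize(stream); // returns only once my_kernel has concluded
+ * @endcode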
+ */ + inline void synchronize(const stream_t& stream) { // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to, // and not have trouble acting on a stream in another context - it balks at doing so under diff --git a/src/cuda/api/texture_view.hpp b/src/cuda/api/texture_view.hpp index c4456948..34a31a47 100644 --- a/src/cuda/api/texture_view.hpp +++ b/src/cuda/api/texture_view.hpp @@ -21,6 +21,7 @@ class texture_view; namespace texture { +/// The CUDA driver's raw, opaque handle for texture objects using raw_handle_t = CUtexObject; /** @@ -111,7 +112,6 @@ class texture_view { other.owning_ = false; }; - template texture_view( const cuda::array_t& arr, @@ -160,7 +160,10 @@ class texture_view { public: // non-mutating getters + /// @returns A non-owning proxy object for the CUDA context in which this texture is defined context_t context() const; + + /// @returns A non-owning proxy object for the CUDA device on which this texture is defined device_t device() const; public: // friendship @@ -174,7 +177,7 @@ class texture_view { bool owning_; }; - +///@cond inline bool operator==(const texture_view& lhs, const texture_view& rhs) noexcept { return lhs.raw_handle() == rhs.raw_handle(); @@ -184,7 +187,7 @@ inline bool operator!=(const texture_view& lhs, const texture_view& rhs) noexcep { return lhs.raw_handle() != rhs.raw_handle(); } - +///@nocond namespace texture { inline texture_view wrap( diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index a569015e..448f0ea7 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -2,7 +2,7 @@ * @file * * @brief Fundamental CUDA-related type definitions. - + * * This is a common file for all definitions of fundamental CUDA-related types, * some shared by different APIs. * @@ -64,16 +64,23 @@ namespace cuda { */ using status_t = CUresult; +/// A size type for use throughout the wrappers library (except when specific API functions +/// limit the size further) using size_t = ::std::size_t; -/** - * The index or number of dimensions of an entity (as opposed to the extent in any - * dimension) - typically just 0, 1, 2 or 3. - */ +/// The index or number of dimensions of an entity (as opposed to the extent in any +/// dimension) - typically just 0, 1, 2 or 3. using dimensionality_t = size_t; +/** + * @namespace array + * + * CUDA facilities for interpolating access to multidimensional array objects, + * in particular via the @ref array_t class. + */ namespace array { +/// An individual dimension extent for an array using dimension_t = size_t; /** @@ -87,12 +94,11 @@ using dimension_t = size_t; template struct dimensions_t; -/** - * Dimensions for 3D CUDA arrays - */ +/// Dimensions for 3D CUDA arrays template<> struct dimensions_t<3> // this almost-inherits cudaExtent { + /// The three constituent individual dimensions, named dimension_t width, height, depth; constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_, dimension_t depth_) @@ -117,25 +123,32 @@ struct dimensions_t<3> // this almost-inherits cudaExtent // 2. 
It doesn't do anything except construct the plain struct - as of CUDA 10 at least } + /// The total number of elements in a 3D entity with these dimensions constexpr __host__ __device__ size_t volume() const { return width * height * depth; } + + /// @copydoc volume constexpr __host__ __device__ size_t size() const { return volume(); } + + /// The number of non-trivial dimensions in this object (axes in which an object with + /// these dimensions is not "flat") constexpr __host__ __device__ dimensionality_t dimensionality() const { return ((width > 1) + (height> 1) + (depth > 1)); } - // Named constructor idioms - + /// Named constructor idiom: Dimensions for an equi-lateral cube static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; } + + ///Named constructor idiom: Dimensions for a one-element object: "Flat" in all individual + // dimensions static constexpr __host__ __device__ dimensions_t zero() { return cube(0); } }; -/** - * Dimensions for 2D CUDA arrays - */ +/// Dimensions for 2D CUDA arrays template<> struct dimensions_t<2> { + /// The two constituent individual dimensions, named; no "depth" for the 2D case. dimension_t width, height; constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_) @@ -159,8 +172,14 @@ struct dimensions_t<2> return *this; } + /// The total number of elements in a 2D entity with these dimensions constexpr __host__ __device__ size_t area() const { return width * height; } + + /// @copydoc area constexpr __host__ __device__ size_t size() const { return area(); } + + /// The number of non-trivial dimensions in this object (axes in which an object with + /// these dimensions is not "flat") constexpr __host__ __device__ dimensionality_t dimensionality() const { return ((width > 1) + (height> 1)); @@ -168,21 +187,33 @@ struct dimensions_t<2> // Named constructor idioms + /// Named constructor idiom: Dimensions for an equi-lateral cube static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x }; } + + ///Named constructor idiom: Dimensions for a one-element object: "Flat" in all individual + // dimensions static constexpr __host__ __device__ dimensions_t zero() { return square(0); } }; } // namespace array /** - * @brief Definitions and functionality related to CUDA events (not - * including the event wrapper type @ref event_t itself) + * @namespace event + * + * CUDA timing functionality, via events and their related code (not including + * the event wrapper type @ref event_t itself) */ namespace event { /// The CUDA driver's raw handle for events using handle_t = CUevent; +/** + * @namespace event::ipc + * + * Definitions and functionality related to CUDA events (not + * including the event wrapper type @ref event_t itself) + */ namespace ipc { /// The CUDA driver's raw handle for events passed between processes @@ -193,10 +224,11 @@ using handle_t = CUipcEventHandle; } // namespace event /** - * @brief Definitions and functionality related to CUDA streams (not + * @namespace stream + * + * Definitions and functionality related to CUDA streams (not * including the device wrapper type @ref stream_t itself) */ - namespace stream { /// The CUDA driver's raw handle for streams @@ -279,7 +311,9 @@ struct dimensions_t // this almost-inherits dim3 // as constexpr, so it isn't __host__ __device__ operator dim3(void) const { return { x, y, z }; } + /// The number of total elements in a 3D object with these dimensions constexpr __host__ __device__ size_t 
volume() const { return static_cast(x) * y * z; } + /// Number of dimensions in which this dimension structure is non-trivial, i.e. coordinates can /// have more than a single value constexpr __host__ __device__ dimensionality_t dimensionality() const @@ -289,11 +323,20 @@ struct dimensions_t // this almost-inherits dim3 // Named constructor idioms + /// Dimensions of an equi-lateral 3D cube static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; } + + /// Dimensions of an equi-lateral 2D square, with a trivial third dimension static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x, 1 }; } + + /// Dimensions of a 1D line, with the last two dimensions trivial static constexpr __host__ __device__ dimensions_t line(dimension_t x) { return dimensions_t{ x, 1, 1 }; } + + /// Dimensions of a single point - trivial in all axes static constexpr __host__ __device__ dimensions_t point() { return dimensions_t{ 1, 1, 1 }; } + /// @returns true if the dimensions on the left-hand side divide, elementwise, those + /// on the right-hand side static bool divides(dimensions_t lhs, dimensions_t rhs) { return @@ -378,6 +421,7 @@ struct overall_dimensions_t } }; +///@cond constexpr bool operator==(overall_dimensions_t lhs, overall_dimensions_t rhs) noexcept { return (lhs.x == rhs.x) and (lhs.y == rhs.y) and (lhs.z == rhs.z); @@ -396,6 +440,7 @@ constexpr overall_dimensions_t operator*(dimensions_t grid_dims, block_dimension grid_dims.z * overall_dimension_t { block_dims.z }, }; } +///@nocond /** * Composite dimensions for a grid - in terms of blocks, then also down @@ -405,18 +450,24 @@ struct composite_dimensions_t { grid::dimensions_t grid; grid::block_dimensions_t block; + /// @returns The overall dimensions of the entire grid as a single 3D entity constexpr overall_dimensions_t flatten() const { return grid * block; } + /// @returns The total number of threads over all blocks of the grid constexpr size_t volume() const { return flatten().volume(); } + /// @returns the number of axes in which the grid overall has non-trivial dimension constexpr size_t dimensionality() const { return flatten().dimensionality(); } + /// A named constructor idiom for the composite dimensions of a single-block grid + /// with a single-thread block static constexpr composite_dimensions_t point() { return { dimensions_t::point(), block_dimensions_t::point() }; } }; +///@cond constexpr bool operator==(composite_dimensions_t lhs, composite_dimensions_t rhs) noexcept { return (lhs.grid == rhs.grid) and (lhs.block == rhs.block); @@ -426,6 +477,7 @@ constexpr bool operator!=(composite_dimensions_t lhs, composite_dimensions_t rhs { return not (lhs == rhs); } +///@nocond } // namespace grid @@ -438,6 +490,7 @@ constexpr bool operator!=(composite_dimensions_t lhs, composite_dimensions_t rhs namespace memory { #if CUDA_VERSION >= 10020 +/// Named boolean constants used in memory read and write access control enum : bool { read_enabled = true, read_disabled = false, @@ -445,10 +498,19 @@ enum : bool { write_disabled = false }; +/** + * An access permission specification structure for physical allocations + * performed by / registered with the CUDA driver + * + * @note defined for ease of use instead of the raw CUDA driver's @ref CUmemAccess_flags + * type. 
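+ *
+ * A brief usage sketch:
+ * @code
+ * auto permissions = cuda::memory::access_permissions_t::read_and_write();
+ * permissions.write = false; // now read-only
+ * @endcode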
+ */ struct access_permissions_t { bool read : 1; bool write : 1; + /// This allows passing this access permissions structure to CUDA driver memory + /// access control API functions operator CUmemAccess_flags() const noexcept { return read ? @@ -456,6 +518,7 @@ struct access_permissions_t { CU_MEM_ACCESS_FLAGS_PROT_NONE; } + /// A named constructor idiom for access permissions static access_permissions_t from_access_flags(CUmemAccess_flags access_flags) { access_permissions_t result; @@ -464,6 +527,7 @@ struct access_permissions_t { return result; } + /// A named constructor idiom for allowing both reading and writing static constexpr access_permissions_t read_and_write() { return access_permissions_t{ read_enabled, write_enabled }; @@ -472,7 +536,8 @@ struct access_permissions_t { namespace physical_allocation { -// TODO: Consider simply aliasing CUmemAllocationHandleType and using constexpr const's or anonymous enums +/// The different kinds of memory handles the CUDA driver recognizes, and can possibly +/// utilize enum class shared_handle_kind_t : ::std::underlying_type::type { #if CUDA_VERSION >= 11020 no_export = CU_MEM_HANDLE_TYPE_NONE, @@ -494,6 +559,7 @@ template <> struct shared_handle_type_helper } // namespace detail_ +/// The raw handle for different kinds of shared memory the CUDA driver recognizes template using shared_handle_t = typename detail_::shared_handle_type_helper::type; @@ -501,12 +567,22 @@ using shared_handle_t = typename detail_::shared_handle_type_helper= 10020 #if CUDA_VERSION >= 11020 -namespace pool { /** - * @note Unsupported for now + * @namespace pool + * + * pool-based allocation functionality */ +namespace pool { + +/// Raw CUDA driver handle for a memory pool; avoid using these and prefer @ref memory::pool_t}. using handle_t = CUmemoryPool; + +/// The different kinds of memory handles the CUDA driver recognizes, and can +/// possibly be used with pool allocation using shared_handle_kind_t = physical_allocation::shared_handle_kind_t; + +/// The raw handle for different kinds of shared memory the CUDA driver recognizes, for +/// possible use with memory pools using physical_allocation::shared_handle_t; } // namespace pool @@ -514,23 +590,26 @@ using physical_allocation::shared_handle_t; namespace pointer { +/// Raw CUDA driver choice type for attributes of pointers using attribute_t = CUpointer_attribute; } // namespace pointer namespace device { -/** - * The numeric type which can represent the range of memory addresses on a CUDA device. - */ +/// The numeric type which can represent the range of memory addresses on a CUDA device. +/// As these addresses are typically just part of the single, unified all-system memory +/// space, this should be the same type as a system memory address' numeric equivalent. + using address_t = CUdeviceptr; static_assert(sizeof(void *) == sizeof(device::address_t), "Unexpected address size"); /** - * Return a pointers address as a numeric value of the type appropriate for device - * @param device_ptr a pointer into device memory - * @return a reinterpretation of @p device_address as a numeric address. + * @returns a cast of a proper pointer into a numeric address in device memory space + * (which is usually just a part of the unified all-system memory space) + * + * @note Typically, this is just a reinterpretation of the same value. 
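+ *
+ * A brief usage sketch (`ptr` is assumed to point into device-accessible memory):
+ * @code
+ * cuda::memory::device::address_t numeric = cuda::memory::device::address(ptr);
+ * assert(cuda::memory::as_pointer(numeric) == ptr);
+ * @endcode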
*/ inline address_t address(const void* device_ptr) noexcept { @@ -538,16 +617,36 @@ inline address_t address(const void* device_ptr) noexcept return reinterpret_cast(device_ptr); } +/** + * @returns The numeric address of the beginning of a memory region + * + * @note Typically, this is just a reinterpretation of the same value. + */ inline address_t address(memory::const_region_t region) noexcept { return address(region.start()); } } // namespace device +/// @returns a cast of a numeric address in device memory space (which, in recent CUDA +/// versions, is just a part of the unified all-system memory space) into a proper +/// pointer. inline void* as_pointer(device::address_t address) noexcept { static_assert(sizeof(void*) == sizeof(device::address_t), "Incompatible sizes for a void pointer and memory::device::address_t"); return reinterpret_cast(address); } +/** + * @namespace shared + * + * A memory space whose contents is shared by all threads in a CUDA kernel block, + * but specific to each kernel block separately. Shared memory is stored in a special + * area in each of a GPU's individual physical cores (symmetric multiprocessors, or + * SM's, in NVIDIA parlance) - which is also the area used for L1 cache. One may + * therefore think of it as L1-cache-like memory which holds arbitrary data rather + * than a cached copy of any global memory locations. It is only usable in + * device-side (= kernel) code, but control and inspection of its size is part of + * the CUDA API functionality. + */ namespace shared { /** @@ -566,19 +665,42 @@ using size_t = unsigned; } // namespace shared +/** + * @namespace managed + * + * Paged memory accessible in both device-side and host-side code by triggering transfers + * of pages between physical system memory and physical device memory. + * + * This type of memory, also known as _unified_ memory, appears within a unified, all-system + * address space - and is used with the same address range on the host and on all relevant + * CUDA devices on a system. It is paged, so that it may exceed the physical size of a CUDA + * device's global memory. The CUDA driver takes care of "swapping" pages "out" from a device + * to host memory or "swapping" them back "in", as well as of propagation of changes between + * devices and host-memory. + * + * @note For more details, see + * + * Unified Memory for CUDA Beginners on the + * Parallel4All blog. 
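+ *
+ * A minimal allocation sketch (the region size here is arbitrary):
+ * @code
+ * auto region = cuda::memory::managed::make_unique_region(1024 * sizeof(float));
+ * auto* data = static_cast<float*>(region.data());
+ * data[0] = 1.0f; // host-side access; the same pointer is also usable in kernel code
+ * @endcode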
+ */ namespace managed { +/// The choice of which category of CUDA devices a managed memory region must be visible to enum class initial_visibility_t { to_all_devices, to_supporters_of_concurrent_managed_access, }; +/// A specifier of one of the attributes of managed memory regions using range_attribute_t = CUmem_range_attribute; } // namespace managed #if CUDA_VERSION >= 11070 +/// Memory barriers can apply to different scopes of scheduled work which must reach the barrier before +/// continuing enum class barrier_scope_t : typename ::std::underlying_type::type { + /// All work on device = CU_STREAM_MEMORY_BARRIER_TYPE_GPU, system = CU_STREAM_MEMORY_BARRIER_TYPE_SYS }; @@ -586,10 +708,13 @@ enum class barrier_scope_t : typename ::std::underlying_type= 10000 /** + * @namespace memory::external + * * Representation of memory resources external to CUDA */ namespace external { +/// Raw CUDA driver handle for an external memory resource represented for the driver using handle_t = CUexternalMemory; /** @@ -692,15 +817,13 @@ using attribute_t = CUdevice_P2PAttribute; namespace context { +/// Raw CUDA driver handle for a context; see {@ref context_t}. using handle_t = CUcontext; using flags_t = unsigned; -/** - * Scheduling policies the Runtime API may use when the host-side - * thread it is running in needs to wait for results from a certain - * device - */ +/// Scheduling policies the CUDA driver may use when the host-side thread it is +/// running in needs to wait for results from a certain device or context. enum host_thread_sync_scheduling_policy_t : unsigned int { /** @@ -762,10 +885,12 @@ using flags_t = context::flags_t; namespace primary_context { +/// Raw CUDA driver handle for a device's primary context using handle_t = cuda::context::handle_t; } // namespace primary_context +/// @copydoc context::host_thread_sync_scheduling_policy_t using host_thread_sync_scheduling_policy_t = context::host_thread_sync_scheduling_policy_t; } // namespace device @@ -785,21 +910,27 @@ inline T identity_cast(U&& x) } // namespace detail_ +/// The CUDA-driver-specific representation of a UUID value; see also {@ref device_t::uuid()} using uuid_t = CUuuid; namespace module { +/// Raw CUDA driver handle of a module of compiled code; see @ref module_t using handle_t = CUmodule; } // namespace module namespace kernel { +/// Raw CUDA driver selector of a kernel attribute using attribute_t = CUfunction_attribute; + +/// The uniform type the CUDA driver uses for all kernel attributes; it is typically more +/// appropriate to use @ref cuda::kernel_t methods, which also employ more specific, +/// appropriate types. using attribute_value_t = int; -// TODO: Is this really only for kernels, or can any device-side function be -// represented by a CUfunction? +/// A raw CUDA driver handle for a kernel; prefer using the @ref cuda::kernel_t type. 
using handle_t = CUfunction; } // namespace kernel diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp index d054217f..a9b65a99 100644 --- a/src/cuda/api/unique_region.hpp +++ b/src/cuda/api/unique_region.hpp @@ -151,6 +151,7 @@ class unique_region : public region_t { namespace device { +/// A unique region of device-global memory using unique_region = memory::unique_region; namespace detail_ { @@ -164,20 +165,30 @@ inline unique_region make_unique_region(const context::handle_t context_handle, } // namespace detail_ /** - * @brief Allocate an array in device-global memory and return an owning class for it + * @brief Allocate a region in device-global memory * - * @param num_bytes the size in bytes of the allocated region - */ -///@{ -/** - * @param device The CUDA device in whose global memory to make the allocation. + * @param context The context within which (and in the device global memory + * of which) to make the allocation + * @param num_bytes Size of the region to be allocated, in bytes + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const context_t& context, size_t num_bytes); + /** - * @param context The CUDA context in which to make the allocation. + * @brief Allocate a region in device-global memory + * + * @param device The device in the global memory of which to make the allocation + * @returns An owning RAII/CADRe object for the allocated memory region */ unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a region in device-global memory within the primary context + * of the current CUDA device + * + * @param device The device in the global memory of which to make the allocation + * @returns An owning RAII/CADRe object for the allocated memory region + */ unique_region make_unique_region(size_t num_bytes); ///}@ @@ -198,19 +209,21 @@ inline device::unique_region make_unique_region(const device_t& device, size_t n namespace host { +/// A unique region of pinned host memory using unique_region = memory::unique_region; -inline unique_region make_unique_region( - const context_t& context, - size_t num_bytes, - allocation_options options = allocation_options{}); -inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +/** + * @brief Allocate a physical-address-pinned region of system memory + * + * @returns An owning RAII/CADRe object for the allocated memory region + */ inline unique_region make_unique_region(size_t num_bytes); } // namespace host namespace managed { +/// A unique region of managed memory, see @ref cuda::memory::managed using unique_region = memory::unique_region; namespace detail_ { @@ -226,14 +239,34 @@ inline unique_region make_unique_region( } // namespace detail_ +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param context A context, to set when allocating the memory region, for whatever + * association effect that may have. + */ inline unique_region make_unique_region( const context_t& context, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @copydoc make_unique_region(size_t num_bytes) + * + * @param device A context, whose primary context will be current when allocating + * the memory region, for whatever association effect that may have. 
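+ *
+ * For example (a sketch; the device index and region size are arbitrary):
+ * @code
+ * auto device = cuda::device::get(0);
+ * auto region = cuda::memory::managed::make_unique_region(
+ *     device, 4096, cuda::memory::managed::initial_visibility_t::to_all_devices);
+ * @endcode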
+ */ inline unique_region make_unique_region( const device_t& device, size_t num_bytes, initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); + +/** + * @brief Allocate a region of managed memory, accessible both from CUDA devices + * and from the CPU. + * + * @returns An owning RAII/CADRe object for the allocated managed memory region + */ inline unique_region make_unique_region( size_t num_bytes); diff --git a/src/cuda/define_specifiers.hpp b/src/cuda/define_specifiers.hpp index a907ea11..4234041b 100644 --- a/src/cuda/define_specifiers.hpp +++ b/src/cuda/define_specifiers.hpp @@ -8,6 +8,8 @@ #ifdef __CUDACC__ +/// Shorthands for CUDA-specific function declaration decorations +///@{ #ifndef CUDA_FD #define CUDA_FD __forceinline__ __device__ #endif @@ -57,5 +59,6 @@ #ifndef CUDA_H #define CUDA_H #endif +///@} #endif // __CUDACC__ diff --git a/src/cuda/nvtx/profiling.hpp b/src/cuda/nvtx/profiling.hpp index 546835a3..af8a7518 100644 --- a/src/cuda/nvtx/profiling.hpp +++ b/src/cuda/nvtx/profiling.hpp @@ -40,7 +40,6 @@ #endif #endif - #include #include #include @@ -51,10 +50,16 @@ namespace cuda { // Note: No implementation for now for nvtxStringHandle_t's - +/** + * @namespace profiling + * + * Interaction with NVIDIA's profiler, particularly tagging, marking and + * indications of entities it will pick up and register/display. + */ namespace profiling { namespace detail_ { + inline void set_message(nvtxEventAttributes_t &attrs, const char *c_str) noexcept { attrs.messageType = NVTX_MESSAGE_TYPE_ASCII; @@ -75,16 +80,23 @@ inline void set_message(nvtxEventAttributes_t &attrs, nvtxStringHandle_t rsh) no } // namespace detail_ +/** + * @namespace range + * + * definitions related to profiled ranges and the @ref range_t class + */ namespace range { -enum class type_t { unspecified, kernel, pci_express_transfer }; - /** - * The range handle is actually `nvtxRangeId_t`; but - other than this typedef, - * we don't need to include the nVIDIA Toolkit Extensions headers at all here, - * and can leave them within the implementation only. + * Types of profiled ranges we recognize + * + * @note The profiler itself does not distinguish between these types of ranges; + * we use them for different styling */ -using handle_t = ::std::uint64_t; +enum class type_t { unspecified, kernel, pci_express_transfer }; + +/// The raw handle of a CUDA profiling range +using handle_t = nvtxRangeId_t; } // namespace range @@ -94,10 +106,18 @@ using handle_t = ::std::uint64_t; * profiling information. 
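+ *
+ * For example:
+ * @code
+ * auto orange = cuda::profiling::color_t::from_hex(0x00FF8000); // an orange-like hue
+ * @endcode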
*/ struct color_t { + /// A profiler color corresponds to a 32-bit value using underlying_type = ::std::uint32_t; + + /// Each color channel is an 8-bit value using channel_value = ::std::uint8_t; + + /// A profiler color is made up of three color channels and a transparency + /// or "alpha" channel channel_value alpha, red, green, blue; + /// Construct a profiler color value from a numeric value (typically, + /// an 8-hex-digit literal) static constexpr color_t from_hex(underlying_type raw_argb) noexcept { return { static_cast ((raw_argb >> 24) & 0xFF), @@ -106,7 +126,8 @@ struct color_t { static_cast ((raw_argb >> 0) & 0xFF), }; } - operator underlying_type() const noexcept { return as_hex(); } + + /// @return the numeric value corresponding to this profiler color underlying_type as_hex() const noexcept { return @@ -115,6 +136,12 @@ struct color_t { static_cast(green) << 8 | static_cast(blue) << 0; } + + /// @copydoc as_hex() + operator underlying_type() const noexcept { return as_hex(); } + + /// Some basic colors, for convenience + ///@{ static constexpr color_t Black() noexcept { return from_hex(0x00000000); } static constexpr color_t White() noexcept { return from_hex(0x00FFFFFF); } static constexpr color_t FullRed() noexcept { return from_hex(0x00FF0000); } @@ -129,8 +156,14 @@ struct color_t { static constexpr color_t DarkGreen() noexcept { return from_hex(0x00008800); } static constexpr color_t DarkBlue() noexcept { return from_hex(0x00000088); } static constexpr color_t DarkYellow() noexcept { return from_hex(0x00888800); } + ///@} }; +/** + * @namespace mark + * + * defintions related to marking individual time points in the profiler timeline + */ namespace mark { namespace detail_ { @@ -156,6 +189,8 @@ nvtxEventAttributes_t create_attributes(const CharT* description, color_t color) } // namespace detail_ +/// Mark a single point on the profiler timeline, giving +/// it also a color and some descriptive text template void point(const CharT* description, color_t color = color_t::Black()) { @@ -165,6 +200,15 @@ void point(const CharT* description, color_t color = color_t::Black()) nvtxMarkEx(&attrs); } +/** + * Mark the beginning of a range on the profiler timeline, giving + * it also a color and some descriptive text + * + * @param type the range type - an unused parameter + * + * @return a handle representing the range, which can be used to mark its + * endpoint + */ template range::handle_t range_start( const CharT* description, @@ -180,6 +224,8 @@ range::handle_t range_start( return range_handle; } +/// Mark the end of a range, using the handle obtained when previously +/// marking its beginning. inline void range_end(range::handle_t range_handle) { static_assert(::std::is_same::value, @@ -189,18 +235,14 @@ inline void range_end(range::handle_t range_handle) } // namespace mark -/** - * Start CUDA profiling for the current process - */ +/// Start CUDA profiling for the current process inline void start() { auto status = cuProfilerStart(); throw_if_error_lazy(status, "Starting CUDA profiling"); } -/** - * Stop CUDA profiling for the current process - */ +/// Stop CUDA profiling for the current process inline void stop() { auto status = cuProfilerStop(); @@ -215,7 +257,9 @@ namespace cuda { namespace profiling { /** - * A RAII class whose scope of existence is reflected as a range in the profiler. + * A RAII/CADRe class whose scope of existence is reflected as a range in the + * profiler. + * * Use it in the scope in which you perform some interesting operation, e.g. 
* perform a synchronous I/O operation (and have it conclude of course), or * launch and synch several related kernels. @@ -333,23 +377,29 @@ inline void name(::std::thread::id host_thread_id, const char* name) } // namespace detail_ /** - * @brief Have the profiler refer to the current thread, or another host - * thread, using a specified string identifier (rather than its numeric ID). + * @brief Have the profiler refer to a given host thread, using a specified string + * identifier (rather than its numeric ID). * * @param[in] host_thread A C++-recognized thread to name in profiling results * @param[in] name The name to use for the specified thread */ -///@{ template void name(const ::std::thread& host_thread, const CharT* name); +/** + * @brief Have the profiler refer to the current thread using a specified string + * identifier (rather than its numeric ID). + * + * @param[in] host_thread A C++-recognized thread to name in profiling results + * @param[in] name The name to use for the specified thread + */ template void name_this_thread(const CharT* name) { detail_::name(::std::this_thread::get_id(), name); } -///@} +/// Have the profile assign a name to a certain stream template void name(const stream_t& stream, const CharT* name) { @@ -357,6 +407,7 @@ void name(const stream_t& stream, const CharT* name) detail_::name_stream(stream.handle(), name); } +/// Have the profile assign a name to a certain event template void name(const event_t& event, const CharT* name) { @@ -364,6 +415,7 @@ void name(const event_t& event, const CharT* name) detail_::name_stream(event.handle(), name); } +/// Have the profile assign a name to a certain CUDA device template void name(const device_t& device, const CharT* name) { diff --git a/src/cuda/rtc/compilation_options.hpp b/src/cuda/rtc/compilation_options.hpp index f24c0adc..c3182c6c 100644 --- a/src/cuda/rtc/compilation_options.hpp +++ b/src/cuda/rtc/compilation_options.hpp @@ -62,7 +62,11 @@ inline cpp_dialect_t cpp_dialect_from_name(const char* dialect_name) noexcept(fa namespace error { +/// Possible ways of handling a potentially problematic finding by the compiler +/// in the program source code enum handling_method_t { raise_error = 0, suppress = 1, warn = 2 }; + +/// Errors, or problematic findings, by the compiler are identified by a number of this type using number_t = unsigned; namespace detail_ { @@ -135,8 +139,7 @@ struct compilation_options_base_t { return set_target(device.compute_capability()); } ///@} - -}; +}; // compilation_options_base_t /// Commonly-used phrases regarding the optimization level (e.g. from GCC's /// command-line arguments), translated into the numeric levels the RTC @@ -150,7 +153,12 @@ enum : rtc::optimization_level_t { maximum_optimization = O3 }; - +/** + * Options to be passed to one of the NVIDIA JIT compilers along with a program's source code + * + * @note at the raw API level, the options are passed in a simpler form, less convenient for + * modification. This is handled by the @ref program_t class. + */ template class compilation_options_t; @@ -160,57 +168,111 @@ class compilation_options_t final : public common_ptx_compilation_options_t { public: + ///@cond using parent = compilation_options_base_t; using parent::parent; + ///@nocond - /** - * Makes the PTX compiler run without producing any CUBIN output - for verifying - * the input PTX only. 
- */ + /// Makes the PTX compiler run without producing any CUBIN output (for PTX verification only) bool parse_without_code_generation { false }; + + /// Allow the JIT compiler to perform expensive optimizations using maximum available resources + /// (memory and compile-time). bool allow_expensive_optimizations_below_O2 { false }; + + /** + * Compile as patch code for CUDA tools. + * + * @note : + * + * 1. Cannot Shall not be used in conjunction with @ref parse_without_code_generation + * or {@ref compile_extensible_whole_program}. + * 2. Some PTX ISA features may not be usable in this compilation mode. + */ bool compile_as_tools_patch { false }; + + /** + * Expecting only whole-programs to be directly usable, allow some calls to not be resolved + * until device-side linking is performed (see @ref link_t). + */ bool compile_extensible_whole_program { false }; + + /// Enable the contraction of multiplcations-followed-by-additions (or subtractions) into single + /// fused instructions (FMAD, FFMA, DFMA) bool use_fused_multiply_add { true }; + + /// Print code generation statistics along with the compilation log bool verbose { false }; + + /** + * Prevent the compiler from merging consecutive basic blocks + * (@ref https://en.wikipedia.org/wiki/Basic_block) into a single block. + * + * Normally, the compiler attempts to merge consecutive "basic blocks" as part of its optimization + * process. However, for debuggable code this is very confusing. + */ bool dont_merge_basicblocks { false }; + + /// The equivalent of suppressing all findings which currently trigger a warning bool disable_warnings { false }; + + /// Disable use of the "optimizer constant bank" feature bool disable_optimizer_constants { false }; + + /// Prevents the optimizing away of the return instruction at the end of a program (a kernel?), + /// making it possible to set a breakpoint just at that point bool return_at_end_of_kernel { false }; + + /// Generate relocatable references for variables and preserve relocations generated for them in + /// the linked executable. bool preserve_variable_relocations { false }; + + /// Warnings about situations likely to result in poor performance + /// or other problems. struct { bool double_precision_ops { false }; bool local_memory_use { false }; bool registers_spill_to_local_memory { false }; - bool indeterminable_stack_size {true }; + bool indeterminable_stack_size { true }; // Does the PTX compiler library actually support this? ptxas does, but the PTX compilation API // doesn't mention it bool double_demotion { false }; } situation_warnings; + + /// Limits on the number of registers which generated object code (of different kinds) is allowed + /// to use struct { optional kernel {}; optional device_function {}; } maximum_register_counts; + /// Options for fully-specifying a caching mode struct caching_mode_spec_t { optional> load {}; optional> store {}; }; struct { + /// The caching mode to be used for instructions which don't specify a caching mode caching_mode_spec_t default_ {}; + /// A potential forcing of the caching mode, overriding even what instructions themselves + /// specify caching_mode_spec_t forced {}; } caching_modes; + /// Get a reference to the caching mode the compiler will be told to use as the default, for load + /// instructions which don't explicitly specify a particular caching mode. 
optional>& default_load_caching_mode() override { return caching_modes.default_.load; } + + /// Get the caching mode the compiler will be told to use as the default, for load instructions + /// which don't explicitly specify a particular caching mode. optional> default_load_caching_mode() const override { return caching_modes.default_.load; } - /** * Specifies the GPU kernels, or `__global__` functions in CUDA-C++ terms, or `.entry` * functions in PTX terms, for which code must be generated. @@ -222,7 +284,7 @@ class compilation_options_t final : ::std::vector<::std::string>& entries(); ::std::vector<::std::string>& kernels(); ::std::vector<::std::string>& kernel_names(); -}; +}; // compilation_options_t template <> class compilation_options_t final : @@ -303,10 +365,8 @@ class compilation_options_t final : */ bool use_fused_multiply_add { true }; - /** - * Make use of fast math operations. Implies use_fused_multiply_add, - * not use_precise_division and not use_precise_square_root. - */ + /// Make use of fast math operations. Implies use_fused_multiply_add, + /// not use_precise_division and not use_precise_square_root. bool use_fast_math { false }; /** @@ -316,48 +376,39 @@ class compilation_options_t final : */ bool link_time_optimization { false }; - /** - * Implicitly add the directories of source files (TODO: Which source files?) as - * include file search paths. - */ + /// Implicitly add the directories of source files (TODO: Which source files?) as include + /// file search paths. bool source_dirs_in_include_path { true }; - /** - * Enables more aggressive device code vectorization in the LTO IR optimizer. - */ + ///Enables more aggressive device code vectorization in the LTO IR optimizer. bool extra_device_vectorization { false }; - /** - * Set language dialect to C++03, C++11, C++14 or C++17. - * - */ + /// The dialect of C++ as which the compiler will be forced to interpret the program source code optional language_dialect { }; + /// Preprocessor macros to have the compiler define, without specifying a particular value ::std::unordered_set<::std::string> no_value_defines; + + /// Preprocessor macros to tell the compiler to specifically _un_define. ::std::unordered_set<::std::string> undefines; + + /// Preprocessor macros to have the compiler define to specific values ::std::unordered_map<::std::string,::std::string> valued_defines; + /// Have the compiler treat all warnings as though they were suppressed, and print nothing bool disable_warnings { false }; - /** - * Treat all kernel pointer parameters as if they had the `restrict` (or `__restrict`) qualifier. - */ + /// Treat all kernel pointer parameters as if they had the `restrict` (or `__restrict`) qualifier. bool assume_restrict { false }; - /** - * Assume functions without an explicit specification of their execution space are `__device__` - * rather than `__host__` functions. - */ + /// Assume functions without an explicit specification of their execution space are `__device__` + /// rather than `__host__` functions. bool default_execution_space_is_device { false }; - /** - * Display (error) numbers for warning (and error?) messages, in addition to the message itself. - */ + /// Display (error) numbers for warning (and error?) messages, in addition to the message itself. bool display_error_numbers { true }; - /** - * Extra options for the PTX compiler (a.k.a. "PTX optimizing assembler"). - */ + /// Extra options for the PTX compiler (a.k.a. "PTX optimizing assembler"). 
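+ /// For example (a sketch; any valid ptxas command-line switches may be used):
+ /// `options.ptxas = "--verbose";`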
::std::string ptxas; /** @@ -421,18 +472,23 @@ class compilation_options_t final : public: // "shorthands" for more complex option setting - compilation_options_t& set_language_dialect(cpp_dialect_t dialect) + /// Let the compiler interpret the program source code using its default-assumption for the + /// C++ language dialect + compilation_options_t& clear_language_dialect() { - language_dialect = dialect; + language_dialect = {}; return *this; } - compilation_options_t& clear_language_dialect() + /// Set which dialect of the C++ language the compiler will try to interpret + /// the program source code as. + compilation_options_t& set_language_dialect(cpp_dialect_t dialect) { - language_dialect = {}; + language_dialect = dialect; return *this; } + /// @copydoc set_language_dialect(cpp_dialect_t) compilation_options_t& set_language_dialect(const char* dialect_name) { return (dialect_name == nullptr or *dialect_name == '\0') ? @@ -440,6 +496,7 @@ class compilation_options_t final : set_language_dialect(detail_::cpp_dialect_from_name(dialect_name)); } + /// @copydoc set_language_dialect(cpp_dialect_t) compilation_options_t& set_language_dialect(const ::std::string& dialect_name) { return dialect_name.empty() ? @@ -447,24 +504,30 @@ class compilation_options_t final : set_language_dialect(dialect_name.c_str()); } + /// Ignore compiler findings of the specified number (rather than warnings about + /// them or raising an error) compilation_options_t& suppress_error(error::number_t error_number) { error_handling_overrides[error_number] = error::suppress; return *this; } + /// Treat compiler findings of the specified number as an error (rather than + /// suppressing them or just warning about them) compilation_options_t& treat_as_error(error::number_t error_number) { error_handling_overrides[error_number] = error::raise_error; return *this; } + /// Treat compiler findings of the specified number as warnings (rather than + /// raising an error or ignoring them) compilation_options_t& warn_about(error::number_t error_number) { error_handling_overrides[error_number] = error::warn; return *this; } -}; +}; // compilation_options_t namespace detail_ { @@ -488,7 +551,6 @@ MarshalTarget& operator<<(MarshalTarget& mt, detail_::opt_start_t& op return mt; } - /** * Uses the streaming/left-shift operator (<<) to render a delimited sequence of * command-line-argument-like options (with or without a value as relevant) @@ -508,8 +570,7 @@ void process( if (opts.generate_relocatable_device_code) { marshalled << opt_start << "--compile-only"; } if (opts.compile_as_tools_patch) { marshalled << opt_start << "--compile-as-tools-patch"; } if (opts.generate_debug_info) { marshalled << opt_start << "--device-debug"; } - if (opts.generate_source_line_info) - { marshalled << opt_start << "--generate-line-info"; } + if (opts.generate_source_line_info) { marshalled << opt_start << "--generate-line-info"; } if (opts.compile_extensible_whole_program) { marshalled << opt_start << "--extensible-whole-program"; } if (not opts.use_fused_multiply_add) { marshalled << opt_start << "--fmad false"; } if (opts.verbose) { marshalled << opt_start << "--verbose"; } @@ -671,6 +732,15 @@ void process( } } +/** + * Finalize a compilation options "building" object into a structure passable to some of the + * CUDA JIT compilation APIs + * + * @tparam Kind The kind of JITable program options to render + * + * @return A structure of multiple strings, passable to various CUDA APIs, but no longer + * easy to modify and manipulate. 
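+ *
+ * @note this is an implementation detail: user code would typically just populate a
+ * @ref compilation_options_t and let the program/compilation wrapper classes perform
+ * the marshalling.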
+ */ template inline marshalled_options_t marshal(const compilation_options_t& opts) { @@ -683,6 +753,14 @@ inline marshalled_options_t marshal(const compilation_options_t& opts) } // namespace detail_ +/** + * Finalize a set of compilation options into the form of a string appendable to a command-line + * + * @tparam Kind The kind of JITable program options to render + * + * @return a string made up of command-line options - switches and options with arguments, + * designated by single or double dashes. + */ template inline ::std::string render(const compilation_options_t& opts) { diff --git a/src/cuda/rtc/compilation_output.hpp b/src/cuda/rtc/compilation_output.hpp index 45e457c5..c95359bf 100644 --- a/src/cuda/rtc/compilation_output.hpp +++ b/src/cuda/rtc/compilation_output.hpp @@ -23,8 +23,18 @@ class context_t; ///@endcond namespace rtc { + +/** + * The output produced by a compilation process by one of the CUDA libraries, + * including any byproducts. + * + * @tparam Kind Which language was compiled to produce the result + * + * @note A failed compilation is also a (useful) kind of compilation output. + */ template class compilation_output_t; + } // namespace rtc ///@cond @@ -41,6 +51,7 @@ class module_t; namespace module { +/// Build a contextualized module from the results of a successful compilation template inline module_t create( const context_t& context, @@ -49,9 +60,6 @@ inline module_t create( } // namespace module -/** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. - */ namespace rtc { namespace program { @@ -237,6 +245,11 @@ template <> inline status_t destroy_and_return_status(handle } // namespace program +/** + * @namespace compilation_output + * + * Definitions relating to and supporting the @ref compilation_output_t class + */ namespace compilation_output { namespace detail_ { @@ -256,12 +269,14 @@ inline compilation_output_t wrap( } // namespace compilation_output /** - * Wrapper class for the result of an NVRTC compilation (including the program handle) - - * whether it succeeded or failed due to errors in the program itself. + * The result of the compilation of an {@ref rtc::program_t}, whether successful or + * failed, with any related byproducts. + * + * @note This class _may_ own a low-level program handle. * - * @note This class _may_ own an NVRTC low-level program handle. * @note If compilation failed due to apriori-invalid arguments - an exception will - * have been thrown. The only failure this class may represent + * have been thrown. A failure indication in this class indicates a program whose + * compilation actually _took place_ and ended with a failure. */ template class compilation_output_base_t { @@ -271,27 +286,30 @@ class compilation_output_base_t { using status_type = status_t; public: // getters + + /// @returns `true` if the compilation resulting in this output had succeeded bool succeeded() const { return succeeded_; } + + /// @returns `true` if the compilation resulting in this output had failed bool failed() const { return not succeeded_; } + + /// @returns `true` if the compilation resulting in this output had succeeded, `false` otherwise operator bool() const { return succeeded_; } const ::std::string& program_name() const { return program_name_; } handle_type program_handle() const { return program_handle_; } public: // non-mutators - // Unfortunately, C++'s standard string class is very inflexible, - // and it is not possible for us to get it to have an appropriately- - // sized _uninitialized_ buffer. 
We will therefore have to use - // a clunkier return type. - // - // ::std::string log() const - /** - * Obtain a copy of the log of the last compilation + * Write a copy of the program compilation log into a user-provided buffer * - * @note This will fail if the program has never been compiled. + * @param[inout] buffer A writable buffer large enough to contain the compilation log + * + * @return the buffer passed in (which has now been overwritten with the log) + * + * @note This will fail if the program has never been compiled, or if the + * buffer is not large enough to hold the complete log (plus nul character). */ - ///@{ span log(span buffer) const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -305,6 +323,13 @@ class compilation_output_base_t { return { buffer.data(), size }; } + /** + * Obtain a copy of the compilation log + * + * @returns an owning container with a nul-terminated copy of the log + * + * @note This will fail if the program has never been compiled. + */ unique_span log() const { size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str()); @@ -317,11 +342,34 @@ class compilation_output_base_t { result[size] = '\0'; return result; } - ///@} #if CUDA_VERSION >= 11010 - virtual unique_span cubin() const = 0; + /** + * Write the CUBIN result of the last compilation into a buffer. + * + * @param[inout] buffer A writable buffer large enough to contain the compiled + * program's CUBIN code. + * @return The sub-buffer, starting at the beginning of @p buffer, containing + * exactly the compiled program's CUBIN (i.e. sized down to fit the contents) + * + * @note This will fail if the program has never been compiled; due to + * compilation failure and also due to LTO/linking failure. + */ virtual span cubin(span buffer) const = 0; + + /** + * Obtain a copy of the CUBIN code resulting from the program compilation + * + * @returns an owning container with a copy of the CUBIN code + * + * @note This will fail if the program has never been compiled; if the compilation + * target was a virtual architecture (in which case only PTX is available); due to + * compilation failure and also due to LTO/linking failure. + */ + virtual unique_span cubin() const = 0; + + /// @returns true if the program has been successfully compiled, with the result + /// containing CUBIN code. virtual bool has_cubin() const = 0; #endif @@ -382,9 +430,17 @@ class compilation_output_t : public compilation_output_base_t ptx(span buffer) const { @@ -398,6 +454,13 @@ class compilation_output_t : public compilation_output_base_t ptx() const { size_t size = program::detail_::get_ptx_size(program_handle_, program_name_.c_str()); @@ -410,8 +473,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t= 11010 - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note CUBIN output is not available when compiling for a virtual architecture only. - * Also, it may be missing in cases such as compilation failure or link-time - * optimization compilation. - * @note This will fail if the program has never been compiled. 
- */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -456,7 +511,6 @@ class compilation_output_t : public compilation_output_base_t(result.data(), program_handle_, program_name_.c_str()); return result; } - ///@} bool has_cubin() const override { @@ -467,13 +521,12 @@ class compilation_output_t : public compilation_output_base_t 0); } - #endif #if CUDA_VERSION >= 11040 /** - * Obtain a copy of the LTO IR result of the last compilation - the intermediate - * representation used for link-time optimization + * Write the LTO IR result of the last compilation - the intermediate + * representation used for link-time optimization - into a buffer * * @throws ::std::invalid_argument if the supplied buffer is too small to hold * the program's LTO IR. @@ -485,7 +538,6 @@ class compilation_output_t : public compilation_output_base_t lto_ir(span buffer) const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -498,6 +550,15 @@ class compilation_output_t : public compilation_output_base_t lto_ir() const { size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str()); @@ -510,12 +571,9 @@ class compilation_output_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t : public compilation_output_base_t { bool own_handle); public: // non-mutators - /** - * Obtain a copy of the CUBIN result of the last compilation. - * - * @note This will fail if the program has never been compiled. - */ - ///@{ span cubin(span buffer) const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -593,7 +645,6 @@ class compilation_output_t : public compilation_output_base_t { return { buffer.data(), size }; } -public: // non-mutators unique_span cubin() const override { size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str()); @@ -606,7 +657,6 @@ class compilation_output_t : public compilation_output_base_t { result[size] = '\0'; return result; } - ///@} bool has_cubin() const override { @@ -695,6 +745,8 @@ template<> inline module_t create( #endif // CUDA_VERSION >= 11010 +/// Build a module from the results of a successful compilation, in the primary context +/// of the specified device template inline module_t create( device_t& device, diff --git a/src/cuda/rtc/error.hpp b/src/cuda/rtc/error.hpp index d880a2b5..8ac427b9 100644 --- a/src/cuda/rtc/error.hpp +++ b/src/cuda/rtc/error.hpp @@ -121,7 +121,6 @@ namespace rtc { template class runtime_error : public ::std::runtime_error { public: - ///@cond // TODO: Constructor chaining; and perhaps allow for more construction mechanisms? 
runtime_error(status_t error_code) : ::std::runtime_error(describe(error_code)), @@ -132,7 +131,6 @@ class runtime_error : public ::std::runtime_error { ::std::runtime_error(::std::move(what_arg) + ": " + describe(error_code)), code_(error_code) { } - ///@endcond runtime_error(status::named_t error_code) : runtime_error(static_cast>(error_code)) { } runtime_error(status::named_t error_code, const ::std::string& what_arg) : @@ -190,6 +188,15 @@ inline void throw_if_error(rtc::status_t status) noexcept(false) if (is_failure(status)) { throw rtc::runtime_error(status); } } +/** + * Throws a @ref ::cuda::rtc::runtime_error exception if the status is not success + * + * @note The rationale for this macro is that neither the exception, nor its constructor + * arguments, are evaluated on the "happy path"; and that cannot be achieved with a + * function - which genertally/typically evaluates its arguments. To guarantee this + * lazy evaluation with a function, we would need exception-construction-argument-producing + * lambdas, which we would obviously rather avoid. + */ #define throw_if_rtc_error_lazy(Kind, status__, ... ) \ do { \ ::cuda::rtc::status_t tie_status__ = static_cast<::cuda::rtc::status_t>(status__); \ diff --git a/src/cuda/rtc/program.hpp b/src/cuda/rtc/program.hpp index 323eefd5..820b85aa 100644 --- a/src/cuda/rtc/program.hpp +++ b/src/cuda/rtc/program.hpp @@ -23,6 +23,19 @@ namespace program { namespace detail_ { +/** + * Create a new program object from source code + * + * @tparam Kind We can create a program with any one of the (two...) kinds of supported + * source code + * @param program_name arbitrary identifier to recognize the program by; it's suggested + * not to get too crazy + * @param program_source The source code of the program, possibly with include directives + * in the case of C++ + * @param num_headers The number of pairs of header "file" names and header content strings + * @param header_sources Pointers to nul-terminated per-header source code + * @param header_names Pointers to nul-terminated names of the different headers + */ template inline program::handle_t create( const char *program_name, @@ -53,6 +66,7 @@ template <> inline program::handle_t create( } #endif // CUDA_VERSION >= 11010 +/// Have NVRTC add the specified global to those accessible/usable after compilation inline void register_global(handle_t program_handle, const char *global_to_register) { auto status = nvrtcAddNameExpression(program_handle, global_to_register); @@ -60,6 +74,8 @@ inline void register_global(handle_t program_handle, const char *globa + " with " + identify(program_handle)); } +/// Splice multiple raw string options together with a ' ' separator character, and +/// surrounding each option with double-quotes inline ::std::string get_concatenated_options(const const_cstrings_span& raw_options) { static ::std::ostringstream oss; @@ -186,14 +202,22 @@ class base_t { using status_type = status_t; public: // getters - + /// Getters for the constituent object fields + ///@{ const ::std::string& name() const { return name_; } + + /// Full source code of the program (possibly with preprocessor directives such as `#include`) const char* source() const { return source_; } + + /// Compilation options to be passed to the JIT compiling library along with the source code const compilation_options_t& options() const { return options_; } // TODO: Think of a way to set compilation options without having // to break the statement, e.g. 
if options had a reflected enum value // or some such arrangement. + + /// Compilation options to be passed to the JIT compiling library along with the source code compilation_options_t& options() { return options_; } + ///@} public: // constructors and destructor explicit base_t(::std::string name) : name_(::std::move(name)) {}; @@ -225,6 +249,9 @@ class program_t; * @note This class is a "reference type", not a "value type". Therefore, making changes * to the program is a const-respecting operation on this class. * + * @note Many of this class' methods could have been placed in the base class, and are + * "duplicated" in program_t - except that they return the program object itself, + * allowing for builder-pattern-like use. */ template <> class program_t : public program::detail_::base_t { @@ -233,26 +260,50 @@ class program_t : public program::detail_::base_t { public: // getters + /// Names of the "memoized"/off-file-system headers made available to the program + /// (and usable as identifiers for `#include` directives) const_cstrings_span header_names() const { return { headers_.names.data(),headers_.names.size()}; } + + /// Sources of the "memoized"/off-file-system headers made available to the program + /// + /// @note Each header source string corresponds to the name at the same index, + /// accessible via @ref header_names(). const_cstrings_span header_sources() const { return { headers_.sources.data(), headers_.sources.size()}; } + + /// @returns the number of memoized/off-file-system headers made available + /// to the program size_t num_headers() const { return headers_.sources.size(); } public: // setters - duplicated with PTX programs + /// Have the compilation produce code for devices with a given compute capability program_t& set_target(device::compute_capability_t target_compute_capability) { options_.set_target(target_compute_capability); return *this; } + + /// Have the compilation produce code for devices with the same compute capability + /// as a given device program_t& set_target(const device_t& device) { return set_target(device.compute_capability());} + + /// Have the compilation produce code for devices with the same compute capability + /// as the device of a given context program_t& set_target(const context_t& context) { return set_target(context.device()); } + + /// Remove all compute capabilities which were chosen to have code produced for them + /// by the compilation program_t& clear_targets() { options_.targets_.clear(); return *this; } + + /// Choose the set of compute capabilities for which the compilation is to + /// produce code template program_t& set_targets(Container target_compute_capabilities) { @@ -262,12 +313,21 @@ class program_t : public program::detail_::base_t { } return *this; } + + /// Have the compilation also produce code for devices with a given compute + /// capability program_t& add_target(device::compute_capability_t target_compute_capability) { options_.add_target(target_compute_capability); return *this; } + + /// Have the compilation also produce code for devices with the same compute + /// capability as a given device void add_target(const device_t& device) { add_target(device.compute_capability()); } + + /// Have the compilation also produce code for devices with the same compute + /// capability as the device of a given context void add_target(const context_t& context) { add_target(context.device()); } program_t& set_source(const char* source) { 
source_ = source; return *this; } @@ -285,7 +345,7 @@ class program_t : public program::detail_::base_t { protected: template - static inline void check_string_type() + static void check_string_type() { using no_cref_string_type = typename ::std::remove_const::type>::type; static_assert( @@ -315,6 +375,15 @@ class program_t : public program::detail_::base_t { void add_header_source_(::std::string&& source) = delete; public: // mutators + /** + * Adds another "memoized" header to the program + * + * @param name The header name for use in `#include` directives + * @param source The full source code of the header "file", possibly with its own + * preprocessor directives (e.g. `#include`). + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& add_header(String1&& name, String2&& source) { @@ -323,6 +392,15 @@ class program_t : public program::detail_::base_t { return *this; } + /** + * Adds another "memoized" header to the program + * + * @param name_and_source A pair of strings, one being the name for use in `#include` + * directives, the other being the full source code of the header "file", possibly + * with its own preprocessor directives (e.g. `#include`). + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& add_header(const ::std::pair& name_and_source) { @@ -331,6 +409,7 @@ class program_t : public program::detail_::base_t { return *this; } + /// @copydoc add_header(String1&&, String2&&) template program_t& add_header(::std::pair&& name_and_source) { @@ -339,6 +418,15 @@ class program_t : public program::detail_::base_t { return add_header(name_and_source); } + /** + * Adds multiple "memoized" headers to the program + * + * @param header_names Names of the headers, for use in `#include` directives + * @param header_sources The full source code of each of the header "files", possibly + * with their own preprocessor directives. + * + * @note "names" with path separators can be used, but are discouraged + */ template const program_t& add_headers( RangeOfNames header_names, @@ -357,7 +445,7 @@ class program_t : public program::detail_::base_t { #ifndef NDEBUG if (new_num_headers > ::std::numeric_limits::max()) { throw ::std::invalid_argument("Cannot use more than " - + ::std::to_string(::std::numeric_limits::max()) + " headers."); + + ::std::to_string(::std::numeric_limits::max()) + " headers."); } #endif headers_.names.reserve(new_num_headers); @@ -371,6 +459,15 @@ class program_t : public program::detail_::base_t { return *this; } + /** + * Adds multiple "memoized" headers to the program + * + * @param name_and_source_pairs A container of pairs of strings, each being made + * up of a name for use in `#include` directives, and the full source code + * of the header "file", possibly with its own preprocessor directives. + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& add_headers(RangeOfNameAndSourcePairs&& named_header_pairs) { @@ -392,6 +489,15 @@ class program_t : public program::detail_::base_t { return *this; } + /** + * Replaces the set of "memoized" headers used in the program's compilation + * + * @param names Names of the headers, for use in `#include` directives + * @param sources The full source code of each of the header "files", possibly + * with their own preprocessor directives. 
+ * + * @note "names" with path separators can be used, but are discouraged + */ template const program_t& set_headers( RangeOfNames&& names, @@ -401,6 +507,15 @@ class program_t : public program::detail_::base_t { return add_headers(names, sources); } + /** + * Replaces the set of "memoized" headers used in the program's compilation + * + * @param name_and_source_pairs A container of pairs of strings, each being made + * up of a name for use in `#include` directives, and the full source code + * of the header "file", possibly with its own preprocessor directives. + * + * @note "names" with path separators can be used, but are discouraged + */ template program_t& set_headers(RangeOfNameAndSourcePairs&& named_header_pairs) { @@ -409,6 +524,7 @@ class program_t : public program::detail_::base_t { return *this; } + /// Removes all "memoized" headers from the program's compilation program_t& clear_headers() { headers_.names.clear(); @@ -416,12 +532,21 @@ class program_t : public program::detail_::base_t { return *this; } + /// Clears any forced values of compilation options, reverting the compilation + /// to the default values program_t& clear_options() { options_ = {}; return *this; } public: // TODO: Support specifying all compilation option in a single string and parsing it + /** + * Compiles the program represented by this object (which, until this point, is + * just a bunch of unrelated sources and options). + * + * @note Carefully examine the @ref compilation_output_t class to understand what + * exactly the compilation produces. + */ compilation_output_t compile() const { if ((source_ == nullptr or *source_ == '\0') and options_.preinclude_files.empty()) { @@ -447,19 +572,19 @@ class program_t : public program::detail_::base_t { * @note The name must continue to exist past the compilation of the program - as it is not copied, * only referenced */ - ///@{ program_t& add_registered_global(const char* unmangled_name) { globals_to_register_.push_back(unmangled_name); return *this; } + + /// @copydoc add_registered_global(const char*) program_t& add_registered_global(const ::std::string& unmangled_name) { globals_to_register_.push_back(unmangled_name.c_str()); return *this; } // TODO: Accept string_view's with C++17 - ///@} /** * @brief Register multiple pre-mangled names of global, to make available for use @@ -472,7 +597,6 @@ class program_t : public program::detail_::base_t { * program - as they are not copied, only referenced. 
Thus, as a safety precaution, we * also assume the container continues to exist */ - ///@{ template program_t& add_registered_globals(const Container& globals_to_register) { @@ -483,6 +607,7 @@ class program_t : public program::detail_::base_t { return *this; } + /// @copydoc add_registered_globals(const Container&) template program_t& add_registered_globals(Container&& globals_to_register) { @@ -491,7 +616,6 @@ class program_t : public program::detail_::base_t { "the possible passing of string-like objects at the end of their lifetime"); return add_registered_globals(static_cast(globals_to_register)); } - ///@} public: // constructors and destructor program_t(::std::string name) : base_t(::std::move(name)) {} @@ -500,9 +624,10 @@ class program_t : public program::detail_::base_t { ~program_t() = default; public: // operators - + ///@cond program_t& operator=(const program_t& other) = default; program_t& operator=(program_t&& other) = default; + ///@endcond protected: // data members struct { @@ -514,6 +639,16 @@ class program_t : public program::detail_::base_t { #if CUDA_VERSION >= 11010 +/** + * Wrapper class for a CUDA PTX (runtime-compilable) program + * + * @note This class is a "reference type", not a "value type". Therefore, making changes + * to the program is a const-respecting operation on this class. + * + * @note Many of this class' methods could have been placed in the base class, and are + * "duplicated" in program_t - except that they return the program object itself, + * allowing for builder-pattern-like use. + */ template <> class program_t : public program::detail_::base_t { public: // types @@ -523,14 +658,23 @@ class program_t : public program::detail_::base_t { public: // setters - duplicated with CUDA-C++/NVRTC programs + /// @copydoc program_t::set_target(device::compute_capability_t) program_t& set_target(device::compute_capability_t target_compute_capability) { options_.set_target(target_compute_capability); return *this; } + + /// @copydoc program_t::set_target(const device_t&) program_t& set_target(const device_t& device) { return set_target(device.compute_capability());} + + /// @copydoc program_t::set_target(const context_t&) program_t& set_target(const context_t& context) { return set_target(context.device()); } + + /// @copydoc program_t::clear_targets() program_t& clear_targets() { options_.targets_.clear(); return *this; } + + /// @copydoc program_t::set_targets(Container) template program_t& set_targets(Container target_compute_capabilities) { @@ -540,27 +684,40 @@ class program_t : public program::detail_::base_t { } return *this; } + + /// @copydoc program_t::add_target(device::compute_capability_t) program_t& add_target(device::compute_capability_t target_compute_capability) { options_.add_target(target_compute_capability); return *this; } + + /// @copydoc program_t::add_target(const device_t&) void add_target(const device_t& device) { add_target(device.compute_capability()); } + + /// @copydoc program_t::add_target(const context_t&) void add_target(const context_t& context) { add_target(context.device()); } - program_t& set_source(const char* source) { source_ = source; return *this; } + /// @copydoc program_t::set_source(const char*) + program_t& set_source(char const* source) { source_ = source; return *this; } + + /// @copydoc program_t::set_source(const ::std::string&) program_t& set_source(const ::std::string& source) { source_ = source.c_str(); return *this; } + + /// @copydoc program_t::set_options(compilation_options_t) program_t& 
set_options(compilation_options_t options) { options_ = ::std::move(options); return *this; } + /// @copydoc program_t::clear_options() program_t& clear_options() { options_ = {}; return *this; } public: // TODO: Support specifying all compilation option in a single string and parsing it + /// @copydoc program_t::compile() compilation_output_t compile() const { if (source_ == nullptr or *source_ == '\0') { @@ -574,7 +731,6 @@ class program_t : public program::detail_::base_t { {option_ptrs.data(), option_ptrs.size()}); } - public: // constructors and destructor program_t(::std::string name) : parent(std::move(name)) {} program_t(const program_t&) = default; @@ -583,20 +739,27 @@ class program_t : public program::detail_::base_t { public: // operators + ///@cond program_t& operator=(const program_t& other) = default; program_t& operator=(program_t&& other) = default; + ///@endcond }; // class program_t #endif // CUDA_VERSION >= 11010 namespace program { +/** + * Create a new (not-yet-compiled) program, specifying only its name; its sources + * and compilation options are to be set subsequently. + */ template inline program_t create(const char* program_name) { return program_t(program_name); } +/// @copydoc create(const char*) template inline program_t create(const ::std::string& program_name) { @@ -605,6 +768,12 @@ inline program_t create(const ::std::string& program_name) } // namespace program +/** + * @returns all compute capabilities supported as targets by NVRTC and (most likely) + * also by the PTX compilation library. + * + * @note The compute capabilities are returned in ascending order. + */ #if CUDA_VERSION >= 11020 inline unique_span supported_targets() diff --git a/src/cuda/rtc/types.hpp b/src/cuda/rtc/types.hpp index 1526a4c6..80f0de90 100644 --- a/src/cuda/rtc/types.hpp +++ b/src/cuda/rtc/types.hpp @@ -51,13 +51,20 @@ enum source_kind_t { // provide a container which may then be resized. /** - * @brief Real-time compilation of CUDA programs using the NVIDIA NVRTC library. + * @namespace rtc + * + * @brief Real-time compilation of programs using NVIDIA's NVRTC and PTX compilation libraries. + */ namespace rtc { /// A span of C-style strings the contents of which must not be modified using const_cstrings_span = span; +/** + * @namespace program + * + * Definitions relating to source-code programs to be compiled + */ namespace program { namespace detail_ {
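+// (The detail_ sub-namespace contains implementation details, not intended for direct use by library users)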