diff --git a/src/cuda/api/array.hpp b/src/cuda/api/array.hpp index 58a9ea49..4519cfd6 100644 --- a/src/cuda/api/array.hpp +++ b/src/cuda/api/array.hpp @@ -270,16 +270,20 @@ array_t wrap( return { device_id, context_handle, handle, dimensions }; } +/// Create a new (typed) CUDA array of the specified dimensions +///@{ +/// @param context ... in which the array is to be created template array_t create( const context_t& context, dimensions_t dimensions); +/// @param device ... in whose primary context the array is to be created template array_t create( const device_t& device, dimensions_t dimensions); - +///@} } // namespace array diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 1933d66e..fa747d67 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -861,6 +861,7 @@ inline context_t get_with_fallback_push() } // namespace current +/// @return true if the context is the primary context of its device bool is_primary(const context_t& context); namespace detail_ { diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp index d3faaf07..9899afe4 100644 --- a/src/cuda/api/launch_config_builder.hpp +++ b/src/cuda/api/launch_config_builder.hpp @@ -179,6 +179,8 @@ class launch_config_builder_t { } } + /// Use the information specified for the builder to figure out the grid and block + /// dimensions with which the kernel is to be launched grid::composite_dimensions_t get_composite_dimensions() const noexcept(false) { auto result = get_unvalidated_composite_dimensions(); @@ -189,6 +191,10 @@ class launch_config_builder_t { } public: + /// Use the information specified to the builder (and defaults for the unspecified + /// information) to finalize the construction of a kernel launch configuration, + /// which can then be passed along with the kernel to a kernel-launching function, + /// e.g. 
the standalone @ref kernel::launch or the stream command @ref stream_t::enqueue_t::kernel_launch launch_configuration_t build() const { auto result = launch_configuration_t{ get_composite_dimensions() }; @@ -392,6 +398,7 @@ class launch_config_builder_t { } + /// Set the dimensions for each block in the intended kernel launch grid launch_config_builder_t& block_dimensions( grid::block_dimension_t x, grid::block_dimension_t y = 1, @@ -400,8 +407,17 @@ class launch_config_builder_t { return block_dimensions(grid::block_dimensions_t{x, y, z}); } + /// Set the block in the intended kernel launch grid to be uni-dimensional + /// with a specified size launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); } + /** + * Set the intended kernel launch grid to have 1D blocks, of the maximum + * length possible given the information specified to the builder. + * + * @note This will fail if neither a kernel nor a device have been chosen + * for the launch. + */ launch_config_builder_t& use_maximum_linear_block() { grid::block_dimension_t max_size; @@ -424,6 +440,16 @@ class launch_config_builder_t { } #if CUDA_VERSION >= 12000 + /** + * Set the dimensions of multi-block clusters within the grid. + * + * @note There is only a small number of possible dimension combinations of clusters; + * and this function does _not_ guarantee to fail immediately if you specify an + * invalid such combination. + * + * @note This setting does not affect the overall dimensions of the grid in terms of + * blocks. 
+ */ launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims) { #ifndef NDEBUG @@ -434,6 +460,9 @@ } #endif + /// Set the dimension of the grid for the intended kernel launch, in terms + /// of blocks + ///@{ launch_config_builder_t& grid_dimensions(grid::dimensions_t dims) { #ifndef NDEBUG @@ -447,6 +476,7 @@ return *this; } + ///@} launch_config_builder_t& grid_dimensions( grid::dimension_t x, grid::dimension_t y = 1, @@ -455,9 +485,17 @@ return grid_dimensions(grid::dimensions_t{x, y, z}); } + /// Set the grid for the intended launch to be one-dimensional, with a specified number + /// of blocks + ///@{ launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); } launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); } + ///@} + + /// Set the overall number of _threads_, in each dimension, of all blocks + /// in the grid of the intended kernel launch + ///@{ launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims) { #ifndef NDEBUG @@ -474,16 +512,30 @@ { return overall_dimensions(grid::overall_dimensions_t{x, y, z}); } + ///@} + /// Set the intended launch grid to be linear, with a specified overall number of _threads_ + /// over all (1D) blocks in the grid launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); } + /** + * Set whether or not blocks may synchronize with each other + * + * @note Recall that even "non-cooperative" blocks can still access the same global memory + * locations, and can use atomic operations on such locations for (slow) synchronization.
+ */ launch_config_builder_t& block_cooperation(bool cooperation) { thread_block_cooperation = cooperation; return *this; } + /// Let kernel thread blocks synchronize with each other, rather than being guaranteed + /// to act independently (atomic global memory operations notwithstanding) launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); } + + /// Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will + /// work entirely independently (atomic global memory operations notwithstanding) launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); } launch_config_builder_t& dynamic_shared_memory_size( @@ -493,11 +545,18 @@ class launch_config_builder_t { return *this; } + /// Indicate that the intended launch should not allocate any shared + /// memory for the kernel to use beyond the static amount necessitated + /// by its (compiled) code. launch_config_builder_t& no_dynamic_shared_memory() { return dynamic_shared_memory_size(memory::shared::size_t(0)); } + /// Indicate that the intended launch should allocate a certain amount of shared + /// memory for the kernel to use beyond the static amount necessitated + /// by its (compiled) code.
+ ///@{ launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size) { #ifndef NDEBUG @@ -512,13 +571,32 @@ class launch_config_builder_t { { return dynamic_shared_memory_size(size); } + ///@} + /** + * Indicate that the intended launch should allocate additional shared + * memory for the kernel to use beyond the static amount necessitated + * by its (compiled) code - with the amount to be determined based on + * the block size + * + * @param shared_mem_size_determiner a function determining the dynamic + * shared memory size given the kernel launch block size + */ launch_config_builder_t& dynamic_shared_memory( kernel::shared_memory_size_determiner_t shared_mem_size_determiner) { return dynamic_shared_memory_size(shared_mem_size_determiner); } + /** + * Indicate that the specified wrapped kernel will be the one launched + * with the configuration to be produced by this object. Such an indication + * provides this object with information about the device and context in + * which the kernel is to be launched, and ranges of possible values for + * certain parameters (e.g. shared memory size, dimensions). + * + * @note Calling this method obviates a call to the @ref device() method. + */ launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr) { if (device_ and kernel_->device_id() != device_.value()) { @@ -533,6 +611,15 @@ class launch_config_builder_t { return *this; } + /** + * Indicate that the intended kernel launch would occur on (some stream in + * some context on) the specified device. Such an indication provides this + * object with some information regarding ranges of possible values for + * certain parameters (e.g. shared memory size, dimensions). + * + * @note Do not call both this and the @ref kernel() method; prefer just that one. 
+ */ + ///@{ launch_config_builder_t& device(const device::id_t device_id) { if (kernel_ and kernel_->device_id() != device_id) { @@ -548,7 +635,11 @@ class launch_config_builder_t { { return this->device(device.id()); } + ///@} + /// Clear the association with a specific kernel (which may have been + /// set using the @ref kernel method) + ///@{ launch_config_builder_t& kernel_independent() { kernel_ = nullptr; @@ -559,13 +650,14 @@ class launch_config_builder_t { kernel_ = nullptr; return *this; } + ///@} /** - * @brief THis will use information about the kernel, the already-set block size, + * @brief This will use information about the kernel, the already-set block size, * and the device to create a unidimensional grid of blocks to exactly saturate * the CUDA device's capacity for simultaneous active blocks. * - * @note This will _not_ set the block size - unlike + * @note This will _not_ set the block size - unlike {@ref min_params_for_max_occupancy()}. */ launch_config_builder_t& saturate_with_active_blocks() { @@ -584,6 +676,14 @@ class launch_config_builder_t { return *this; } + /** + * @brief This will use information about the kernel and the device to define + * a minimum launch grid which should guarantee maximum occupancy of the GPU's + * multiprocessors. + * + * @note A builder after this call _will_ set the block dimensions - unlike + * {@ref saturate_with_active_blocks()} . 
+ */ launch_config_builder_t& min_params_for_max_occupancy() { if (not (kernel_)) { @@ -600,6 +700,7 @@ } }; // launch_config_builder_t +/// A slightly shorter-named construction idiom for @ref launch_config_builder_t inline launch_config_builder_t launch_config_builder() { return {}; } } // namespace cuda diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp index f891f8e0..e213e7ed 100644 --- a/src/cuda/api/launch_configuration.hpp +++ b/src/cuda/api/launch_configuration.hpp @@ -58,7 +58,16 @@ enum class cluster_scheduling_policy_t { }; #endif +/** + * The full set of possible configuration parameters for launching a kernel on a GPU. + * + * @note Consider using a @ref launch_config_builder_t to incrementally construct + * these structs. + * + * @note This structure must be constructed with at least the grid and block dimensions. + */ struct launch_configuration_t { + /// Dimensions of the launch grid in blocks, and of the individual blocks in the grid. grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } }; /** @@ -160,6 +169,20 @@ struct launch_configuration_t { constexpr launch_configuration_t(const launch_configuration_t&) = default; constexpr launch_configuration_t(launch_configuration_t&&) = default; + /** + * Constructors corresponding to the CUDA runtime API's triple-chevron launch + * syntax: + * + * my_kernel <<< grid_dims, block_dims, dynamic_shmem_size, my_stream >>> ( + * arg1, arg2, arg3, etc); + * + * ... where the specified aspects of the launch configuration are the dimensions + * and the dynamic shared memory size. + * + * @note The choices of stream and kernel function are _not_ part of the launch + * configuration.
+ */ + ///@{ constexpr launch_configuration_t( grid::composite_dimensions_t grid_and_block_dimensions, memory::shared::size_t dynamic_shared_mem = 0u @@ -184,12 +207,14 @@ struct launch_configuration_t { grid::block_dimensions_t(block_dims), dynamic_shared_mem) { } + ///@} CPP14_CONSTEXPR launch_configuration_t& operator=(const launch_configuration_t& other) = default; CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) = default; }; #if __cplusplus < 202002L +///@cond constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept { return @@ -210,6 +235,7 @@ constexpr bool operator!=(const launch_configuration_t lhs, const launch_configu { return not (lhs == rhs); } +///@endcond #endif namespace detail_ { diff --git a/src/cuda/api/link.hpp b/src/cuda/api/link.hpp index ef55571e..bd028477 100644 --- a/src/cuda/api/link.hpp +++ b/src/cuda/api/link.hpp @@ -20,14 +20,14 @@ namespace cuda { ///@cond class device_t; - class module_t; - class link_t; ///@endcond +/// Definitions related to CUDA linking-processes, captured by the @ref link_t wrapper class namespace link { +/// Kinds of images which can be used by the linker (some may require driver compilation work) enum class input_kind_t { cubin, /// Compiled device-class-specific device code ptx, /// PTX (microarchitecture-inspecific intermediate representation) @@ -36,9 +36,14 @@ enum class input_kind_t { library, /// An archive of objects files with embedded device code; a `.a` file }; +/// A raw CUDA driver handle for a linking-process using handle_t = CUlinkState; -// TODO: Check if the linking has been completed! +/** + * @brief Wrap an existing CUDA link-process in a @ref link_t wrapper class instance. + * + * @todo : Consider checking if the linking has already been completed! 
+ */ inline link_t wrap( device::id_t device_id, context::handle_t context_handle, @@ -48,21 +53,19 @@ inline link_t wrap( inline link_t create(const void *image, const link::options_t &options); -// TODO: Use a clase-class with C++17 of later, made up of the two classes here +/// Definitions relating to inputs to CUDA linking-processes namespace input { -/** - * A typed, named, image in memory which can be used as an input to a runtime - * CUDA linking process. - */ +/// A typed, named, image in memory which can be used as an input to a runtime CUDA linking-process struct image_t : memory::region_t { - const char *name; - link::input_kind_t type; + const char *name; /// Link images are attached a name when registered in a linking-process + link::input_kind_t type; /// type of contents found in the memory region }; +/// A typed, named, image in a file which can be used as an input to a runtime CUDA linking-process struct file_t { - const char *path; // TODO: Use a proper path in C++14 and later - link::input_kind_t type; + const char *path; + link::input_kind_t type; /// type of contents found in the file }; } // namespace input @@ -115,7 +118,16 @@ class link_t { return memory::region_t{cubin_output_start, cubin_output_size}; } - // TODO: Replace this with methods which take wrapper classes. + /** + * Add another linkable image, from memory, to this linking-process + * + * @param[in] image Memory region containing the image + * @param[in] ptx_compilation_options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. 
+ */ void add(link::input::image_t image, const link::options_t &ptx_compilation_options = {}) const { auto marshalled_options = link::detail_::marshal(ptx_compilation_options); @@ -134,6 +146,17 @@ class link_t { + ::std::to_string(static_cast(image.type)) + " to a link."); } + /** + * Add another linkable image, from a file, to this linking-process + * + * @param[in] file_input Path of the image file to be added + * @param[in] options Options for compiling PTX code to cubin, if necessary, + * before linking. + * + * @note Some types of linkable images are not, in fact, even compiled - but can be compiled + * by the driver with the specified @p options. + */ + ///@{ void add_file(link::input::file_t file_input, const link::options_t &options) const { auto marshalled_options = link::detail_::marshal(options); @@ -156,6 +179,7 @@ class link_t { return add_file(path.c_str(), file_contents_type); } #endif + ///@} protected: // constructors @@ -219,6 +243,7 @@ class link_t { namespace link { +/// Create a new link-process (before adding any compiled images or image-files) inline link_t create(const link::options_t &options = link::options_t{}) { handle_t new_link_handle; @@ -241,13 +266,12 @@ inline link_t create(const link::options_t &options = link::options_t{}) do_take_ownership); } -// TODO: Check if the linking has been completed!
inline link_t wrap( - device::id_t device_id, - context::handle_t context_handle, - link::handle_t handle, - const link::options_t &options, - bool take_ownership) noexcept + device::id_t device_id, + context::handle_t context_handle, + link::handle_t handle, + const link::options_t & options, + bool take_ownership) noexcept { return link_t{device_id, context_handle, handle, options, take_ownership}; } diff --git a/src/cuda/api/link_options.hpp b/src/cuda/api/link_options.hpp index a2e17d59..0c895e06 100644 --- a/src/cuda/api/link_options.hpp +++ b/src/cuda/api/link_options.hpp @@ -1,7 +1,8 @@ /** * @file * - * @brief Definitions and utility functions relating to just-in-time compilation and linking of CUDA code. + * @brief Definitions and utility functions relating to just-in-time compilation, assembly + * and linking of CUDA code. */ #pragma once #ifndef CUDA_API_WRAPPERS_ASSEMBLY_AND_LINK_OPTIONS_HPP_ @@ -21,17 +22,37 @@ class module_t; namespace link { +/// Possible strategies for obtaining fully-compiled binary code for a target device +/// when it is not immediately available. enum fallback_strategy_for_binary_code_t { + /// Prefer compiling available PTX code to produce fully-compiled binary code prefer_compiling_ptx = 0, + /// Prefer using existing fully-compiled (binary) code, for a compatible but + /// not identical target device prefer_using_compatible_binary = 1, }; namespace detail_ { +/// The CUDA driver's raw generic JIT-related option type using option_t = CUjit_option; +/** + * Mechanism for finalizing options into a format readily usable by the + * link_t wrapper (and by the `cuLink`- functions - but highly inconvenient + * for inspection and modification. + * + * @note Don't create these yourself unless you have to; use @ref options_t + * instead, and @ref options_t::marshal() when done, for completing the + * linking-process. If you must create them - use `push_back()` method + * repeatedly until done with all options. 
+ */ struct marshalled_options_t { + /// The CUDA driver's expected type for number of link-related options using size_type = unsigned; + + /// The CUDA driver's enum for option identification has this many values - + /// and thus, there is need for no more than this many marshalled options constexpr static const size_type max_num_options { CU_JIT_NUM_OPTIONS }; protected: @@ -39,8 +60,6 @@ struct marshalled_options_t { ::std::array value_buffer; size_type count_ { 0 }; public: - size_type count() const { return count_; } - void push_back(option_t option) { if (count_ >= max_num_options) { @@ -76,7 +95,12 @@ struct marshalled_options_t { } public: - + /** + * This method (alone) is used to populate this structure. + * + * @note The class is not a standard container, and this method cannot be + * reversed or undone, i.e. there is no `pop_back()` or `pop()`. + */ template void push_back(option_t option, T value) { @@ -85,25 +109,48 @@ struct marshalled_options_t { // Now set value_buffer[count-1]... value_buffer[count_-1] = process_value(value); } + + /// These three methods yield what the CUDA driver actually expects: + /// Two matching raw buffers and their count of elements + ///@{ const option_t* options() const { return option_buffer.data(); } const void * const * values() const { return value_buffer.data(); } + size_type count() const { return count_; } + ///@} }; } // namespace detail_ +/** + * A convenience class for holding, setting and inspecting options for a CUDA binary code + * linking process - which may also involve PTX compilation. + * + * @note This structure does not let you set those options which the CUDA driver documentation + * describes as having internal purposes only. + */ struct options_t final : public rtc::common_ptx_compilation_options_t { + /// options related to logging the link-process struct { + /// Non-error information regarding the logging process (i.e. 
its "standard output" stream) optional> info; + + /// Information regarding errors in the logging process (i.e. its "standard error" stream) optional> error; + + /// Control whether the info and error logging will be verbose bool verbose; } logs; // Note: When this is true, the specific_target of the base class // is overridden + /// Instead of using explicitly-specified binary target, from + /// @ref common_ptx_compilation_options_t::specific_target - use the device of the current CUDA + /// context as the target for binary generation bool obtain_target_from_cuda_context { true }; - /// fallback behavior if a (matching cubin???) is not found + /// Possible strategy for obtaining fully-compiled binary code when it is not + /// simply available in the input to the link-process optional fallback_strategy_for_binary_code; // Ignoring the "internal purposes only" options; diff --git a/src/cuda/api/miscellany.hpp b/src/cuda/api/miscellany.hpp index 6d04f547..1913ac0d 100644 --- a/src/cuda/api/miscellany.hpp +++ b/src/cuda/api/miscellany.hpp @@ -30,6 +30,13 @@ inline void initialize_driver() throw_if_error_lazy(status, "Failed initializing the CUDA driver"); } +/** + * A mechanism for ensuring a @ref cuInit() call has been made, to use before making + * any other driver API calls. + * + * @note differs from simply calling `initialize_driver()` in that repeated calls + * from the same thread will avoid additional @ref cuInit() call. 
+ */ inline void ensure_driver_is_initialized() { thread_local bool driver_known_to_be_initialized{false}; diff --git a/src/cuda/api/peer_to_peer.hpp b/src/cuda/api/peer_to_peer.hpp index 37fca510..47a91aef 100644 --- a/src/cuda/api/peer_to_peer.hpp +++ b/src/cuda/api/peer_to_peer.hpp @@ -25,7 +25,6 @@ constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NAT constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array iterpolatory access operations are supported across the link, 0 otherwise #endif - namespace detail_ { /** * @brief Get one of the numeric attributes for a(n ordered) pair of devices, @@ -76,8 +75,17 @@ namespace current { namespace peer_to_peer { +/** + * @brief Allow operations in one context to access the memory space of + * another context. + * + * @note Calling this function does not ensure access will succeed, as access might + * be impossible due to hardware and/or driver specifics. + */ void enable_access_to(const context_t &context, const context_t &peer_context); +/// Prevent operations in one context from accessing the memory space of +/// another context. void disable_access_to(const context_t &context, const context_t &peer_context); } // namespace peer_to_peer