From 24f622b376352a82963511f85ae164b10d8230d2 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sat, 14 Jan 2023 20:21:53 +0200 Subject: [PATCH] Fixes #175, #458: Added graph support, including stream capture and a graph node builder class. Also added two modified CUDA sample programs using graph support: * graphMemoryNodes * jacobiCudaGraphs --- CMakeLists.txt | 1 + examples/CMakeLists.txt | 7 + examples/common.hpp | 12 + .../jacobiCudaGraphs/jacobi.cu | 186 ++++ .../jacobiCudaGraphs/jacobi.h | 77 ++ .../jacobiCudaGraphs/jacobi_kernels.cuh | 172 ++++ .../jacobiCudaGraphs/main.cpp | 182 ++++ .../simpleCudaGraphs/simpleCudaGraphs.cu | 508 +++++++++++ examples/other/jitify/string_view.hpp | 34 +- src/cuda/api.hpp | 13 +- src/cuda/api/context.hpp | 42 + src/cuda/api/detail/for_each_argument.hpp | 26 + src/cuda/api/device.hpp | 68 ++ src/cuda/api/external.hpp | 4 + src/cuda/api/graph/identify.hpp | 100 +++ src/cuda/api/graph/instance.hpp | 504 +++++++++++ src/cuda/api/graph/node.hpp | 152 ++++ src/cuda/api/graph/node_builder.hpp | 606 +++++++++++++ src/cuda/api/graph/template.hpp | 829 ++++++++++++++++++ src/cuda/api/graph/typed_node.hpp | 558 ++++++++++++ src/cuda/api/memory_pool.hpp | 4 + src/cuda/api/multi_wrapper_impls/graph.hpp | 230 +++++ src/cuda/api/stream.hpp | 79 +- src/cuda/api/types.hpp | 78 +- 24 files changed, 4449 insertions(+), 23 deletions(-) create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp create mode 100644 examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu create mode 100644 src/cuda/api/detail/for_each_argument.hpp create mode 100644 src/cuda/api/graph/identify.hpp create mode 100644 src/cuda/api/graph/instance.hpp create mode 100644 src/cuda/api/graph/node.hpp create mode 100644 src/cuda/api/graph/node_builder.hpp create mode 100644 src/cuda/api/graph/template.hpp create mode 100644 src/cuda/api/graph/typed_node.hpp create mode 100644 src/cuda/api/multi_wrapper_impls/graph.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2fdfece9..69ceda5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ write_basic_package_version_file( COMPATIBILITY ${COMPAT_SETTING} ) + install( FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-api-wrappers-config-version.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-api-wrappers" diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 5adaaadc..078b1591 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -144,6 +144,13 @@ if(USE_COOPERATIVE_GROUPS AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0") add_executable(binaryPartitionCG modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu) endif() add_executable(bandwidthtest modified_cuda_samples/bandwidthtest/bandwidthtest.cpp) +if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.0") + add_executable(simpleCudaGraphs modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu) + add_executable(jacobiCudaGraphs + modified_cuda_samples/jacobiCudaGraphs/main.cpp + modified_cuda_samples/jacobiCudaGraphs/jacobi.cu + ) +endif() #---- add_executable(version_management by_api_module/version_management.cpp) diff --git a/examples/common.hpp b/examples/common.hpp index 4a7467ba..62e5a3a4 100644 --- a/examples/common.hpp +++ b/examples/common.hpp @@ -35,6 +35,18 @@ bool 
your_type_was_() { return true; } #define print_type_of(_x) your_type_was_() #endif +inline const char* ordinal_suffix(int n) +{ + static const char suffixes [4][5] = {"th", "st", "nd", "rd"}; + auto ord = n % 100; + if (ord / 10 == 1) { ord = 0; } + ord = ord % 10; + return suffixes[ord > 3 ? 0 : ord]; +} + +template +inline ::std::string xth(N n) { return ::std::to_string(n) + ordinal_suffix(n); } + const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref) { static const char* cache_preference_names[] = { diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu new file mode 100644 index 00000000..a583d5ec --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu @@ -0,0 +1,186 @@ +#include "jacobi_kernels.cuh" +#include "jacobi.h" + +#include +#include +#include +#include + +static void finalize_error( + const cuda::stream_t& stream, span d_sum, const cuda::launch_configuration_t& launch_config, + double& sum, int k, const span x_to_overwrite) +{ + stream.enqueue.memzero(d_sum); + auto final_error_launch_config = launch_config; + final_error_launch_config.dimensions.grid.x = (N_ROWS / final_error_launch_config.dimensions.block.x) + 1; + auto warps_per_block = final_error_launch_config.dimensions.block.x / cuda::warp_size; + final_error_launch_config.dynamic_shared_memory_size = (warps_per_block + 1) * sizeof(double); + // TODO: Double-check the original source to ensure we're using the right x here + stream.enqueue.kernel_launch(finalError, final_error_launch_config, x_to_overwrite.data(), d_sum.data()); + stream.enqueue.copy(&sum, d_sum); + stream.synchronize(); + report_error_sum("GPU", k + 1, sum); +} + +template<> +double do_jacobi_inner( + const cuda::device_t &device, + const cuda::stream_t &stream, + span A, + span b, + float convergence_threshold, + int num_iterations, + span x, + span x_new, + span d_sum) +{ + auto launch_config = cuda::launch_config_builder() + .block_size(256) + .grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) + .build(); + + double sum; + + auto graph = cuda::graph::create(); + + using cuda::graph::node::kind_t; + + auto memset_node = [&] { + cuda::graph::node::parameters_t params; + params.value = 0; + params.width_in_bytes = 4; + params.region = d_sum; + return graph.insert.node(params); + }(); + + auto jacobi_kernel = cuda::kernel::get(device, JacobiMethod); + struct { cuda::graph::node::parameters_t odd, even; } kernel_params = { + { jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x, x_new, d_sum) }, + { jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x_new, x, d_sum) }, + }; + auto jacobi_kernel_node = graph.insert.node(kernel_params.even); + + graph.insert.edge(memset_node, jacobi_kernel_node); + + auto memcpy_node = [&] { + cuda::memory::copy_parameters_t<3> params; + params.set_source(d_sum); + params.set_destination(&sum, 1); + params.set_extent(1); + params.clear_offsets(); + params.clear_rest(); + return graph.insert.node(params); + }(); + + graph.insert.edge(jacobi_kernel_node, memcpy_node); + + + cuda::graph::instance_t instance = graph.instantiate(); + +// std::cout << "settings node params for the kernel node with k == " << k << " and params.marshalled_arguments.size() = " +// << params.marshalled_arguments.size() << std::endl; + + for (int k = 0; k < num_iterations; k++) { + instance.launch(stream); + 
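+ // Note: the graph's final memcpy node writes d_sum back into the host-side `sum`;
+ // the synchronization below makes sure that value has arrived before it is tested.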
stream.synchronize(); + + if (sum <= convergence_threshold) { + auto x_to_overwrite = ((k & 1) == 0) ? x : x_new; + finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); + break; + } + // Odd iterations have an even value of k, since we start with k == 0; + // but - here we sent + const auto& next_iteration_params = ((k & 1) == 0) ? kernel_params.even : kernel_params.odd; + instance.set_node_parameters(jacobi_kernel_node, next_iteration_params); + } + return sum; +} + +template<> +double do_jacobi_inner( + const cuda::device_t &, + const cuda::stream_t &stream, + span A, + span b, + float convergence_threshold, + int num_iterations, + span x, + span x_new, + span d_sum) +{ + auto launch_config = cuda::launch_config_builder() + .block_size(256) + .grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) + .build(); + + ::std::unique_ptr instance_ptr{}; + + double sum = 0.0; + for (int k = 0; k < num_iterations; k++) { + stream.begin_capture(cuda::stream::capture::mode_t::global); + stream.enqueue.memzero(d_sum); + auto x_to_read = ((k & 1) == 0) ? x : x_new; + auto x_to_overwrite = ((k & 1) == 0) ? x_new : x; + stream.enqueue.kernel_launch(JacobiMethod, launch_config, + A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data()); + stream.enqueue.copy(&sum, d_sum); + auto graph = stream.end_capture(); + + if (instance_ptr == nullptr) { + auto instance = graph.instantiate(); + instance_ptr.reset(new cuda::graph::instance_t{::std::move(instance)}); + } + else { + instance_ptr->update(graph); + // Note: The original code tried to re-instantiate if the update + // of the instance failed, we don't do this. + } + stream.enqueue.graph_launch(*instance_ptr); + stream.synchronize(); + + if (sum <= convergence_threshold) { + finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); + break; + } + } + + return sum; +} + +template<> +double do_jacobi_inner( + const cuda::device_t &, + const cuda::stream_t &stream, + span A, + span b, + float convergence_threshold, + int num_iterations, + span x, + span x_new, + span d_sum) +{ + auto launch_config = cuda::launch_config_builder() + .block_size(256) + .grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) + .build(); + + double sum; + for (int k = 0; k < num_iterations; k++) { + stream.enqueue.memzero(d_sum); + auto x_to_read = ((k & 1) == 0) ? x : x_new; + auto x_to_overwrite = ((k & 1) == 0) ? x_new : x; + stream.enqueue.kernel_launch(JacobiMethod, launch_config, + A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data()); + stream.enqueue.copy(&sum, d_sum); + stream.synchronize(); + + if (sum <= convergence_threshold) { + finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); + break; + } + } + + return sum; +} + diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h new file mode 100644 index 00000000..9b95b1ba --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef JACOBI_H +#define JACOBI_H + +#define N_ROWS 512 + +#include + +#if __cplusplus >= 202001L +using span = std::span; +#else +using cuda::span; +#endif + +#define N_ROWS 512 + +enum computation_method_t { + graph_with_set_kernel_params = 0, + graph_with_exec_update = 1, + non_graph_gpu = 2, + cpu = 3 +}; + +inline const char* method_name(computation_method_t method) +{ + static const char* method_names[] = { + "graph_with_set_kernel_params", + "graph_with_exec_update", + "non_graph_gpu", + "cpu" + }; + return method_names[method]; +} + +void report_error_sum(const char* where, int num_iterations, double sum_on_cpu); + +template +double do_jacobi_inner( + const cuda:: device_t& device, + const cuda::stream_t &stream, + span A, + span b, + float conv_threshold, + int num_iterations, + span x, + span x_new, + span d_sum); + + +#endif // JACOBI_H \ No newline at end of file diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh new file mode 100644 index 00000000..a57a6de4 --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh @@ -0,0 +1,172 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "jacobi.h" + +#include + +namespace cg = cooperative_groups; + +// 8 Rows of square-matrix A processed by each CTA. +// This can be max 32 and only power of 2 (i.e., 2/4/8/16/32). +#define ROWS_PER_CTA 8 + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != + // NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +static __global__ void JacobiMethod( + float const *A, + double const *b, + float const convergence_threshold, + double const *x, + double *x_new, + double *sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ double x_shared[N_ROWS]; // N_ROWS == n + __shared__ double b_shared[ROWS_PER_CTA + 1]; + + for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) { + x_shared[i] = x[i]; + } + + if (threadIdx.x < ROWS_PER_CTA) { + int k = threadIdx.x; + for (int i = k + (blockIdx.x * ROWS_PER_CTA); + (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) { + b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; + } + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; + (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) { + double rowThreadSum = 0.0; + for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) { + rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); + } + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + rowThreadSum += tile32.shfl_down(rowThreadSum, offset); + } + + if (tile32.thread_rank() == 0) { + atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); + } + } + + cg::sync(cta); + + if (threadIdx.x < ROWS_PER_CTA) { + cg::thread_block_tile tile8 = + cg::tiled_partition(cta); + double temp_sum = 0.0; + + int k = threadIdx.x; + + for (int i = k + (blockIdx.x * ROWS_PER_CTA); + (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) { + double dx = b_shared[i % (ROWS_PER_CTA + 1)]; + dx /= A[i * N_ROWS + i]; + + x_new[i] = (x_shared[i] + dx); + temp_sum += fabs(dx); + } + + for (int offset = tile8.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile8.shfl_down(temp_sum, offset); + } + + if (tile8.thread_rank() == 0) { + atomicAdd(sum, temp_sum); + } + } +} + +// Thread block size for finalError kernel should be multiple of 32 +static __global__ void finalError(double const *x, double *g_sum) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ double warpSum[]; + double sum = 0.0; + + int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + + for 
(int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) { + double d = x[i] - 1.0; + sum += fabs(d); + } + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + sum += tile32.shfl_down(sum, offset); + } + + if (tile32.thread_rank() == 0) { + warpSum[threadIdx.x / warpSize] = sum; + } + + cg::sync(cta); + + double blockSum = 0.0; + if (threadIdx.x < (blockDim.x / warpSize)) { + blockSum = warpSum[threadIdx.x]; + } + + if (threadIdx.x < 32) { + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + blockSum += tile32.shfl_down(blockSum, offset); + } + if (tile32.thread_rank() == 0) { + atomicAdd(g_sum, blockSum); + } + } +} diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp b/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp new file mode 100644 index 00000000..9a8e5386 --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp @@ -0,0 +1,182 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// This sample demonstrates Instantiated CUDA Graph Update +// with Jacobi Iterative Method in 3 different methods: +// 1 - JacobiMethodGpuCudaGraphExecKernelSetParams() - CUDA Graph with +// cudaGraphExecKernelNodeSetParams() 2 - JacobiMethodGpuCudaGraphExecUpdate() - +// CUDA Graph with cudaGraphExecUpdate() 3 - JacobiMethodGpu() - Non CUDA Graph +// method + +// Jacobi method on a linear system A*x = b, +// where A is diagonally dominant and the exact solution consists +// of all ones. +// The dimension N_ROWS is included in jacobi.h + +#include +#include +#include +#include +#include +#include "jacobi.h" + +[[noreturn]] bool die_(const ::std::string& message = "") +{ + if (not message.empty()) { ::std::cerr << message << std::endl; } + exit(EXIT_FAILURE); +} + +// creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1 +// elsewhere. 
The elements of the right hand side b all equal 2*n, hence the +// exact solution x to A*x = b is a vector of ones. +void createLinearSystem(span A, span b) +{ + int i, j; + for (i = 0; i < N_ROWS; i++) { + b[i] = 2.0 * N_ROWS; + for (j = 0; j < N_ROWS; j++) A[i * N_ROWS + j] = 1.0; + A[i * N_ROWS + i] = N_ROWS + 1.0; + } +} + +// Run the Jacobi method for A*x = b on CPU. +std::pair do_jacobi_on_cpu(span A, span b, float convergence_threshold, int max_iterations) +{ + auto x_ = std::array{}; + span x = { x_.data(), N_ROWS }; + auto x_new = std::array { 0 }; + int k; + + for (k = 0; k < max_iterations; k++) { + double sum = 0.0; + for (int i = 0; i < N_ROWS; i++) { + double temp_dx = b[i]; + for (int j = 0; j < N_ROWS; j++) temp_dx -= A[i * N_ROWS + j] * x[j]; + temp_dx /= A[i * N_ROWS + i]; + x_new[i] += temp_dx; + sum += ::std::fabs(temp_dx); + } + std::copy(x_new.cbegin(), x_new.cend(), x.begin()); + + if (sum <= convergence_threshold) break; + } + + double sum = std::accumulate(std::begin(x), std::end(x), 0.0, + [](double accumulation_so_far, double element) { + return accumulation_so_far + ::std::fabs(element - 1.0); + } ); +// sdkStopTimer(&timerCPU); + + report_error_sum("CPU", k+1, sum); + std::cout << '\n'; + + return {k + 1, sum}; +} + +template +bool do_gpu_jacobi( + const cuda::device_t& device, + const cuda::stream_t& stream, + span A, + span b, + span d_A, + span d_b, + float convergence_threshold, + int max_iterations, + span d_x, + span d_x_new, + span d_sum, + double sum_on_cpu +) +{ + stream.enqueue.memzero(d_x); + stream.enqueue.memzero(d_x_new); + stream.enqueue.copy(d_A, A); + stream.enqueue.copy(d_b, b); + +// sdkCreateTimer(&timerGpu); +// sdkStartTimer(&timerGpu); + + std::cout << "Jacobi computation with method " << method_name(Method) << ":\n"; + double sum = do_jacobi_inner(device, stream, d_A, d_b, convergence_threshold, max_iterations, d_x, d_x_new, d_sum); + + bool success = ::std::fabs(sum_on_cpu - sum) < convergence_threshold; + std::cout << (success ? "PASSED" : "FAILED") << "\n\n"; + return success; +} + +void report_error_sum(const char* where, int num_iterations, double sum_on_cpu) +{ + std::cout << where << " iterations : " << num_iterations << '\n'; + auto cout_flags (std::cout.flags()); + std::cout << where << " error : " << std::setprecision(3) << std::scientific << sum_on_cpu << '\n'; + std::cout.setf(cout_flags); +} + +int main(int argc, char **argv) +{ + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
::std::stoi(argv[1]) : cuda::device::default_device_id; + auto device = cuda::device::get(device_id); + + auto b = cuda::memory::host::make_unique_span(N_ROWS); + auto A = cuda::memory::host::make_unique_span(N_ROWS * N_ROWS); + + createLinearSystem(A, b); + + float convergence_threshold = 1.0e-2; + int max_num_iterations = 4 * N_ROWS * N_ROWS; + + // create timer +// StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; +// sdkCreateTimer(&timerCPU); + +// sdkStartTimer(&timerCPU); + auto num_iterations_and_sum = do_jacobi_on_cpu(A, b, convergence_threshold, max_num_iterations); + auto num_iterations = num_iterations_and_sum.first; + auto sum_on_cpu = num_iterations_and_sum.second; + +// printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); + + auto d_A = cuda::memory::device::make_unique_span(device, N_ROWS * N_ROWS); + auto d_b = cuda::memory::device::make_unique_span(device, N_ROWS); + auto d_x = cuda::memory::device::make_unique_span(device, N_ROWS); + auto d_x_new = cuda::memory::device::make_unique_span(device, N_ROWS); + auto d_sum = cuda::memory::device::make_unique_span(device, 1); + auto stream = cuda::stream::create(device, cuda::stream::async); + + do_gpu_jacobi( + device, stream, A, b, d_A, d_b, convergence_threshold, num_iterations, d_x, d_x_new, d_sum, sum_on_cpu) or die_(); + do_gpu_jacobi( + device, stream, A, b, d_A, d_b, convergence_threshold, num_iterations, d_x, d_x_new, d_sum, sum_on_cpu) or die_(); + do_gpu_jacobi( + device, stream, A, b, d_A, d_b, convergence_threshold, num_iterations, d_x, d_x_new, d_sum, sum_on_cpu) or die_(); + + std::cout << "SUCCESS\n"; +} + diff --git a/examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu b/examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu new file mode 100644 index 00000000..0715bddd --- /dev/null +++ b/examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu @@ -0,0 +1,508 @@ +/* + * Original code Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#if __cplusplus >= 202001L +using span = std::span; +#else +using cuda::span; +#endif + +namespace cg = cooperative_groups; + +#define THREADS_PER_BLOCK 512 +#define GRAPH_LAUNCH_ITERATIONS 3 + +__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, size_t outputSize) +{ + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double) inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + double beta = temp_sum; + double temp; + + for (int i = tile32.size() / 2; i > 0; i >>= 1) { + if (tile32.thread_rank() < i) { + temp = tmp[cta.thread_rank() + i]; + beta += temp; + tmp[cta.thread_rank()] = beta; + } + cg::sync(tile32); + } + cg::sync(cta); + + if (cta.thread_rank() == 0 && blockIdx.x < outputSize) { + beta = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) { + beta += tmp[i]; + } + outputVec[blockIdx.x] = beta; + } +} + +__global__ void reduceFinal(double *inputVec, double *result, size_t inputSize) +{ + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double) inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // do reduction in shared mem + if ((blockDim.x >= 512) && (cta.thread_rank() < 256)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256]; + } + + cg::sync(cta); + + if ((blockDim.x >= 256) && (cta.thread_rank() < 128)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128]; + } + + cg::sync(cta); + + if ((blockDim.x >= 128) && (cta.thread_rank() < 64)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 64]; + } + + cg::sync(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockDim.x >= 64) temp_sum += tmp[cta.thread_rank() + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile32.shfl_down(temp_sum, offset); + } + } + // write result for this block to global mem + if (cta.thread_rank() == 0) result[0] = temp_sum; +} + +void init_input(cuda::span a) { + auto generator = []() { return(rand() & 0xFF) / (float)RAND_MAX; }; + std::generate_n(a.data(), a.size(), generator); +} + +void myRealHostNodeCallback(char const *graph_construction_mode, double result) +{ + std::cout << "Host callback in graph constructed by " << graph_construction_mode << ": result = " << 
result << std::endl; + result = 0.0; // reset the result +} + +void CUDART_CB myHostNodeCallback(void *type_erased_data) +{ + auto *data = reinterpret_cast*>(type_erased_data); + auto graph_construction_mode = data->first; + auto result = data->second; + myRealHostNodeCallback(graph_construction_mode, *result); +} + +void report_attempt(const char* attempt_kind, const char* how_created) +{ + std::cout << '\n' + << "Attempting " << attempt_kind << " of a CUDA graph, with construction method: " << how_created << '\n' + << "----------------------------------------------------------------\n"; +} + +void use(const cuda::device_t &device, const cuda::graph::template_t &graph, const char* how_created) +{ + report_attempt("use", how_created); + std::cout << "Number of graph nodes = " << graph.num_nodes() << '\n'; + + auto instance = cuda::graph::instantiate(graph); + + auto cloned = graph.clone(); + auto cloned_graph_instance = cuda::graph::instantiate(graph); + + auto stream_for_graph = cuda::stream::create(device, cuda::stream::async); + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + std::cout + << "Launching an instance of the original graph: launch " + << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << std::endl; + instance.launch(stream_for_graph); + } + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + std::cout + << "Launching an instance of the cloned graph: launch " + << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << std::endl; + cloned_graph_instance.launch(stream_for_graph); + } + std::cout << std::endl; + stream_for_graph.synchronize(); +} + +void cudaGraphsManual( + const cuda::device_t& device, + span inputVec_h, + span inputVec_d, + span outputVec_d, + span result_d) +{ + const char* graph_construction_mode = "explicit node and edge insertion calls"; + report_attempt("construction", graph_construction_mode); + double result_h = 0.0; + + using node_kind_t = cuda::graph::node::kind_t; + auto graph = cuda::graph::create(); + + auto memcpy_node = [&] { + cuda::memory::copy_parameters_t<3> copy_params; + // TODO: Have the copy_parameters_t class be more like a builder. 
+ // And - accept sizes with dimensionality upto the copy params dimensionality + copy_params.set_source(inputVec_h); + copy_params.set_destination(inputVec_d); + // TODO: Need to tweak the copy parameters class so that these next few lines are not necessary; + // and to make sure we don't use the params without everything necessary being set + copy_params.set_extent(inputVec_h.size()); + copy_params.clear_offsets(); + copy_params.clear_rest(); + auto current_context = cuda::context::current::get(); + return graph.insert.node(current_context, copy_params); + }(); + + auto memset_node = [&] { + cuda::graph::node::parameters_t params; + params.value = 0; + params.width_in_bytes = 4; + params.region = outputVec_d; + return graph.insert.node(params); + }(); + + auto reduce_node = [&] { + auto reduce_kernel = cuda::kernel::get(device, reduce); + auto launch_config = cuda::launch_config_builder() + .grid_size(outputVec_d.size()) + .block_size(THREADS_PER_BLOCK) + .build(); + auto kernel_arg_pointers = cuda::graph::make_kernel_argument_pointers( + inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()); + auto kernel_node_args = cuda::graph::make_launch_primed_kernel(reduce_kernel, launch_config, kernel_arg_pointers); + return graph.insert.node(kernel_node_args); + }(); + + graph.insert.edge(memcpy_node, reduce_node); + graph.insert.edge(memset_node, reduce_node); + + auto memset_result_node = [&] { + cuda::graph::node::parameters_t params; + params.value = 0; + params.width_in_bytes = 4; + params.region = result_d; + return graph.insert.node(params); + }(); + + auto reduce_final_node = [&] { + auto kernel = cuda::kernel::get(device, reduceFinal); + auto launch_config = cuda::launch_config_builder() + .grid_size(1) + .block_size(THREADS_PER_BLOCK) + .build(); + auto arg_ptrs = cuda::graph::make_kernel_argument_pointers(outputVec_d.data(), result_d.data(), outputVec_d.size()); + return graph.insert.node(kernel, launch_config, arg_ptrs); + }(); + + graph.insert.edge(reduce_node, reduce_final_node); + graph.insert.edge(memset_result_node, reduce_final_node); + + auto memcpy_result_node = [&] { + cuda::memory::copy_parameters_t<3> copy_params; + // TODO: Have the copy_parameters_t class be more like a builder. + // And - accept sizes with dimensionality upto the copy params dimensionality + copy_params.set_source(result_d); + copy_params.set_destination(&result_h, 1); + copy_params.set_extent(1); + copy_params.clear_offsets(); + copy_params.clear_rest(); + return graph.insert.node(copy_params); + }(); + + graph.insert.edge(reduce_final_node, memcpy_result_node); + + auto host_function_data = std::make_pair(graph_construction_mode, &result_h); + auto host_function_node = graph.insert.node(myHostNodeCallback, &host_function_data); + + graph.insert.edge(memcpy_result_node, host_function_node); + + use(device, graph, graph_construction_mode); +} + +void cudaGraphsManualWithBuilders( + const cuda::device_t& device, + span inputVec_h, + span inputVec_d, + span outputVec_d, + span result_d) +{ + const char* graph_construction_mode = "use of node builders and explicit edge insertions"; + report_attempt("construction", graph_construction_mode); + double result_h = 0.0; + + using node_kind_t = cuda::graph::node::kind_t; + auto graph = cuda::graph::create(); + + // what about building via the graph object? + cuda::graph::node::builder_t builder; + + std::cout << "Building a memcpy node" << std::endl; + + // TODO: Consider having builder_t::memory_copy , builder_t::memory_set etc. 
+ auto memcpy_node = builder.kind() + .source(inputVec_h) + .destination(inputVec_d) + // the following is not necessary - it is deduced + // .extent(inputVec_h.size()) + // No need to clear anything - that's taken care of by default + // .copy_params.clear_offsets(); + // .copy_params.clear_rest(); + // No need to set the context - we default to the current context! + // .context(cuda::context::current::get()) + .build_within(graph); + + std::cout << "Building a memset node" << std::endl; + + auto memset_node = builder.kind() + .region(inputVec_d) + .value(0) + .build_within(graph); + + std::cout << "Building a kernel launch node" << std::endl; + + + auto wrapped_reduce_kernel = cuda::kernel::get(device, reduce); + auto reduce_node = [&]{ + auto launch_config = cuda::launch_config_builder() + .grid_size(outputVec_d.size()) + .block_size(THREADS_PER_BLOCK) + .build(); + + return builder.kind() + .kernel(wrapped_reduce_kernel) + .launch_configuration(launch_config) + .arguments(inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()) + // Also need to support add_argument called multiple times + .build_within(graph); + }(); + + graph.insert.edge(memcpy_node, reduce_node); + graph.insert.edge(memset_node, reduce_node); + + std::cout << "Building a memset node" << std::endl; + + auto memset_result_node = builder.kind() + .region(result_d) + .value(0) + .build_within(graph); + + + std::cout << "Building a kernel launch node" << std::endl; + + auto final_reduce_launch_config = cuda::launch_config_builder() + .grid_size(1) + .block_size(THREADS_PER_BLOCK) + .build(); + + auto wrapped_reduce_final_kernel = cuda::kernel::get(device, reduceFinal); + auto reduce_final_node = builder.kind() + .kernel(wrapped_reduce_final_kernel) + .launch_configuration(final_reduce_launch_config) + .arguments(outputVec_d.data(), result_d.data(), outputVec_d.size()) + // Also need to support add_argument called multiple times + .build_within(graph); + + graph.insert.edge(reduce_node, reduce_final_node); + graph.insert.edge(memset_result_node, reduce_final_node); + + std::cout << "Building a memcpy node" << std::endl; + + auto memcpy_result_node = builder.kind() + .source(result_d) + .destination(cuda::span{&result_h,1}) + .build_within(graph); + + graph.insert.edge(reduce_final_node, memcpy_result_node); + + auto host_function_data = std::make_pair(graph_construction_mode, &result_h); + + std::cout << "Building a host function node" << std::endl; + + auto host_function_node = builder.kind() + .argument(&host_function_data) + .function(myHostNodeCallback) + .build_within(graph); + + graph.insert.edge(memcpy_result_node, host_function_node); + + use(device, graph, graph_construction_mode); +} + +void cudaGraphsUsingStreamCapture( + const cuda::device_t& device, + span inputVec_h, + span inputVec_d, + span outputVec_d, + span result_d) +{ + const char* graph_construction_mode = "stream capture"; + report_attempt("construction", graph_construction_mode); + double result_h = 0.0; + + using cuda::stream::async; + auto stream_1 = cuda::stream::create(device, async); + auto stream_2 = cuda::stream::create(device, async); + auto stream_3 = cuda::stream::create(device, async); + + auto fork_stream_event = cuda::event::create(device); + auto reduce_output_memset_event = cuda::event::create(device); + auto final_result_memset_event = cuda::event::create(device); + + stream_1.begin_capture(cuda::stream::capture::mode_t::global); + + stream_1.enqueue.event(fork_stream_event); + 
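+ // Cross-stream "fork": while stream_1 is being captured, having the other streams
+ // wait on an event recorded on it pulls their subsequent work into the same capture,
+ // so that work becomes additional (potentially concurrent) nodes of the resulting graph.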
stream_2.enqueue.wait(fork_stream_event); + stream_3.enqueue.wait(fork_stream_event); + + stream_1.enqueue.copy(inputVec_d, inputVec_h); + stream_2.enqueue.memzero(outputVec_d); + + stream_2.enqueue.event(reduce_output_memset_event); + stream_3.enqueue.memzero(result_d); + stream_3.enqueue.event(final_result_memset_event); + + stream_1.enqueue.wait(reduce_output_memset_event); + + auto launch_config = cuda::launch_config_builder() + .grid_dimensions(outputVec_d.size()) + .block_dimensions(THREADS_PER_BLOCK) + .build(); + + stream_1.enqueue.kernel_launch(reduce, launch_config, + inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()); + + stream_1.enqueue.wait(final_result_memset_event); + + launch_config = cuda::launch_config_builder() + .grid_dimensions(1) + .block_dimensions(THREADS_PER_BLOCK) + .build(); + stream_1.enqueue.kernel_launch(reduceFinal, launch_config, + outputVec_d.data(), result_d.data(), outputVec_d.size()); + + stream_1.enqueue.copy(&result_h, result_d); + + auto callback = [&]() { myRealHostNodeCallback(graph_construction_mode, result_h); }; + stream_1.enqueue.host_invokable(callback); + + auto graph = stream_1.end_capture(); + + use(device, graph, graph_construction_mode); +} + +[[noreturn]] bool die_(const ::std::string& message) +{ + ::std::cerr << message << "\n"; + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + size_t size { 1 << 24 }; // number of elements to reduce + size_t maxBlocks { 512 }; + + if (cuda::device::count() == 0) { + die_("No CUDA devices on this system"); + } + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? + ::std::stoi(argv[1]) : cuda::device::default_device_id; + + auto device = cuda::device::get(device_id); + + std::cout + << size << " elements\n" + << "threads per block = " << THREADS_PER_BLOCK << '\n' + << "Graph Launch iterations = " << GRAPH_LAUNCH_ITERATIONS << '\n' + << std::flush; + + auto inputVec_h = cuda::memory::host::make_unique_span(size); + auto inputVec_d = cuda::memory::device::make_unique_span(device, size); + auto outputVec_d = cuda::memory::device::make_unique_span(device, maxBlocks); + auto result_d = cuda::memory::device::make_unique_span(device, 1); + + init_input(inputVec_h); + + auto result_verification = ::std::accumulate( +#if __cplusplus >= 201712L + ::std::execution::par_unseq, +#endif + inputVec_h.begin(), inputVec_h.end(), 0.0); + std::cout << "Expected result = " << result_verification << '\n'; + + device.synchronize(); + + cudaGraphsManual(device, inputVec_h, inputVec_d, outputVec_d, result_d); + + cudaGraphsUsingStreamCapture(device, inputVec_h, inputVec_d, outputVec_d, result_d); + + device.synchronize(); + std::flush(std::cout); + + cudaGraphsManualWithBuilders(device, inputVec_h, inputVec_d, outputVec_d, result_d); + + std::cout << "\n\nSUCCESS\n"; +} diff --git a/examples/other/jitify/string_view.hpp b/examples/other/jitify/string_view.hpp index c0fa2b6e..bc1419d2 100644 --- a/examples/other/jitify/string_view.hpp +++ b/examples/other/jitify/string_view.hpp @@ -526,7 +526,7 @@ inline nssv_constexpr std::size_t length( CharT * s, std::size_t result = 0 ) template< typename CharT > inline nssv_constexpr14 std::size_t length( CharT * s ) { - std::size_t result = 0; + ::std::size_t result = 0; while ( *s++ != '\0' ) { ++result; @@ -570,7 +570,7 @@ class basic_string_view typedef const_pointer iterator; typedef const_pointer const_iterator; typedef std::reverse_iterator< const_iterator > reverse_iterator; - 
typedef std::reverse_iterator< const_iterator > const_reverse_iterator; + typedef ::std::reverse_iterator< const_iterator > const_reverse_iterator; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; @@ -1042,14 +1042,14 @@ class basic_string_view #else - std::basic_string + ::std::basic_string to_string() const { return std::basic_string( begin(), end() ); } template< class Allocator > - std::basic_string + ::std::basic_string to_string( Allocator const & a ) const { return std::basic_string( begin(), end(), a ); @@ -1129,12 +1129,12 @@ nssv_constexpr bool operator==( template< class CharT, class Traits> nssv_constexpr bool operator==( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } template< class CharT, class Traits> nssv_constexpr bool operator==( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } @@ -1155,12 +1155,12 @@ nssv_constexpr bool operator!=( template< class CharT, class Traits> nssv_constexpr bool operator!=( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return !( lhs == rhs ); } template< class CharT, class Traits> nssv_constexpr bool operator!=( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return !( lhs == rhs ); } @@ -1181,12 +1181,12 @@ nssv_constexpr bool operator<( template< class CharT, class Traits> nssv_constexpr bool operator<( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) < 0; } template< class CharT, class Traits> nssv_constexpr bool operator<( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) > 0; } @@ -1207,12 +1207,12 @@ nssv_constexpr bool operator<=( template< class CharT, class Traits> nssv_constexpr bool operator<=( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) <= 0; } template< class CharT, class Traits> nssv_constexpr bool operator<=( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) >= 0; } @@ -1233,12 +1233,12 @@ nssv_constexpr bool operator>( template< class CharT, class Traits> nssv_constexpr bool operator>( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) > 0; } template< class CharT, class Traits> nssv_constexpr bool operator>( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) < 0; } @@ -1259,12 +1259,12 @@ nssv_constexpr bool operator>=( template< class CharT, class Traits> nssv_constexpr bool operator>=( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) >= 0; } template< class CharT, class Traits> nssv_constexpr bool operator>=( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) <= 0; } @@ -1414,7 +1414,7 @@ Stream & write_to_stream( Stream & os, View const & sv ) template< class CharT, class Traits > std::basic_ostream & operator<<( - std::basic_ostream& os, + ::std::basic_ostream& os, 
basic_string_view sv ) { return detail::write_to_stream( os, sv ); diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index 2024b799..51af11f0 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -56,6 +56,13 @@ #include "api/external.hpp" #endif // CUDA_VERSION >= 10000 +#if CUDA_VERSION >= 10000 +#include "api/graph/node.hpp" +#include "api/graph/typed_node.hpp" +#include "api/graph/template.hpp" +#include "api/graph/instance.hpp" +#endif // CUDA_VERSION >= 10000 + #include "api/multi_wrapper_impls/pointer.hpp" #include "api/multi_wrapper_impls/array.hpp" #include "api/multi_wrapper_impls/event.hpp" @@ -72,7 +79,9 @@ #include "api/multi_wrapper_impls/module.hpp" #include "api/multi_wrapper_impls/ipc.hpp" #include "api/multi_wrapper_impls/launch_configuration.hpp" - #include "api/launch_config_builder.hpp" - +#if CUDA_VERSION >= 10000 +#include "api/multi_wrapper_impls/graph.hpp" +#include "api/graph/node_builder.hpp" +#endif // CUDA_VERSION >= 10000 #endif // CUDA_API_WRAPPERS_HPP_ diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 350c877d..e744a354 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -260,6 +260,12 @@ class context_t { * `my_context.memory::foo()`, think of it as a `my_dev::memory::foo()`. */ class global_memory_type { +#if CUDA_VERSION >= 11000 + public: // data types + using execution_graph_related_attribute_t = CUgraphMem_attribute; +#endif + + protected: // data members const device::id_t device_id_; const context::handle_t context_handle_; @@ -325,6 +331,42 @@ class context_t { CAW_SET_SCOPE_CONTEXT(context_handle_); return context::detail_::free_memory(context_handle_); } + +#if CUDA_VERSION >= 11000 + /** + * @note CUDA devices use special memory allocation for execution graphs - not managed via + * cuMemAlloc and friends. These methods allow for controlling and querying some aspects of this + * memory. + */ + ///@{ + void free_unused_execution_graph_memory() const + { + auto status = cuDeviceGraphMemTrim(device_id_); + throw_if_error_lazy(status, + "Trimming memory used for CUDA execution graphs on " + device::detail_::identify(device_id_)); + } + + /** + * @param attribute See the documentation for @ref CUgraphMem_attribute for the possible attributes + */ + size_t get_execution_graph_related_attribute(execution_graph_related_attribute_t attribute) const + { + cuuint64_t result; + auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result); + throw_if_error_lazy(status, "Failed obtaining an execution-graph-related memory attribute for " + + device::detail_::identify(device_id_)); + return result; + } + + void reset_execution_graph_usage_high_watermark() const + { + cuuint64_t value_{0}; + auto status = cuDeviceSetGraphMemAttribute(device_id_, CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, &value_); + throw_if_error_lazy(status, "Failed setting an execution-graph-related memory attribute for " + + device::detail_::identify(device_id_)); + } + ///@} +#endif }; // class global_memory_type diff --git a/src/cuda/api/detail/for_each_argument.hpp b/src/cuda/api/detail/for_each_argument.hpp new file mode 100644 index 00000000..06b33a78 --- /dev/null +++ b/src/cuda/api/detail/for_each_argument.hpp @@ -0,0 +1,26 @@ +#ifndef CUDA_API_WRAPPERS_FOR_EACH_ARGUMENT_HPP +#define CUDA_API_WRAPPERS_FOR_EACH_ARGUMENT_HPP + +#include + +namespace cuda { + +namespace detail_ { + +template +void for_each_argument(F) { } + +template +void for_each_argument(F f, Args&&... 
args) { + using arrT = int[]; + static_cast(arrT{(f(::std::forward(args)), 0)...}); +// This: +// [](...){}((f(::std::forward(args)), 0)...); +// doesn't guarantee execution order +} + +} // namespaced detail_ + +} // namespaced cuda + +#endif //CUDA_API_WRAPPERS_FOR_EACH_ARGUMENT_HPP diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index e5a85038..68cae7ac 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -140,6 +140,74 @@ class device_t { using properties_t = device::properties_t; using attribute_value_t = device::attribute_value_t; +#if CUDA_VERSION >= 11000 + class global_memory_type : public context_t::global_memory_type { + + /** + * Obtains the amount of memory used for execution-graph-related resource on + * this device. + * + * @param reserved + * when false, the memory in actual use at the moment is returned; when true, + * the amount returned is the total reserved for use by the "asynchronous + * allocator" used for execution graphs + * @param high_watermark + * when false, the current amount of memory is returned; when true, the + * returned amount is the "high watermark" of device memory use on the device + * since the last reset. + */ + size_t amount_used_for_graphs( + bool reserved = false, + bool high_watermark = false) const + { + auto attribute = reserved ? + (high_watermark ? CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT : CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH) : + (high_watermark ? CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT : CU_GRAPH_MEM_ATTR_USED_MEM_HIGH); + size_t result; + auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result); + throw_if_error_lazy(status, + "Obtaining the current amount of memory used for execution graphs on " + + device::detail_::identify(device_id_)); + return result; + } + + /** + * Releases any memory allocated for execution graph resources back to the OS + */ + void free_unused_graph_memory() const + { + auto status = cuDeviceGraphMemTrim(device_id_); + throw_if_error_lazy(status, "Freeing unused execution graph memory on " + + device::detail_::identify(device_id_)); + } + + /** + * Obtains the amount of memory used for execution-graph-related resource on + * this device. + * + * @param high_watermark + * when false, and by default, the current amount of memory used is + * returned; when true, the returned amount is the "high watermark" of + * device memory usage for graph resources since the last reset. + */ + size_t amount_used_for_graphs(bool high_watermark = false) const + { + size_t result; + auto status = cuDeviceGetGraphMemAttribute( + device_id_, + high_watermark ? + CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT : + CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, + &result); + throw_if_error_lazy(status, + "Obtaining the current amount of memory used for execution graphs on " + + device::detail_::identify(device_id_)); + return result; + } + + }; +#endif // CUDA_VERSION >= 11000 + /** * * @note The memory proxy regards the device's primary context. 
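As a rough orientation for reviewers, the following minimal sketch shows how the execution-graph memory facilities added in context.hpp and device.hpp above might be exercised. It is illustrative only and not part of the patch; in particular, it assumes the new members are reachable through the library's existing memory-proxy accessor (written here as context.memory()), and uses only the methods defined in the hunks above.

    #include <cuda/api.hpp>
    #include <iostream>

    // Illustrative sketch, not part of the patch: query and trim the memory CUDA keeps
    // around for execution-graph allocations on the device of a given context.
    void report_and_trim_graph_memory(const cuda::context_t& context)
    {
        auto mem = context.memory(); // assumed accessor for the global_memory_type proxy
        // Bytes currently backing execution-graph allocations for this context's device
        auto used_now = mem.get_execution_graph_related_attribute(CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT);
        std::cout << "Graph memory currently in use: " << used_now << " bytes\n";
        mem.reset_execution_graph_usage_high_watermark();
        // Return any graph-allocator memory not currently in use back to the OS
        mem.free_unused_execution_graph_memory();
    }
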
diff --git a/src/cuda/api/external.hpp b/src/cuda/api/external.hpp index db5dfc9a..d4173f51 100644 --- a/src/cuda/api/external.hpp +++ b/src/cuda/api/external.hpp @@ -9,6 +9,8 @@ #ifndef CUDA_API_WRAPPERS_EXTERNAL_HPP_ #define CUDA_API_WRAPPERS_EXTERNAL_HPP_ +#if CUDA_VERSION >= 10000 + #include "memory.hpp" #include "unique_region.hpp" @@ -187,4 +189,6 @@ inline unique_region map(const resource_t& resource) } // namespace memory } // namespace cuda +#endif // CUDA_VERSION >= 10000 + #endif // CUDA_API_WRAPPERS_EXTERNAL_HPP_ diff --git a/src/cuda/api/graph/identify.hpp b/src/cuda/api/graph/identify.hpp new file mode 100644 index 00000000..73290aad --- /dev/null +++ b/src/cuda/api/graph/identify.hpp @@ -0,0 +1,100 @@ +/** + * @file + * + */ +#pragma once +#ifndef CUDA_GRAPH_API_WRAPPERS_ERROR_HPP_ +#define CUDA_GRAPH_API_WRAPPERS_ERROR_HPP_ + +#if CUDA_VERSION >= 10000 + +#include "../error.hpp" +#include "../types.hpp" + +namespace cuda { + +namespace graph { + +namespace template_ { + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return "execution graph template " + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} +/* + +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle); +} + +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle, device_id); +} +*/ + +} // namespace detail_ + +} // namespace template_ + +namespace instance { + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return "execution graph instance " + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} + +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle); +} + +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle, device_id); +} + +} // namespace detail_ + +} // namespace instance + +namespace node { + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return ::std::string("node with handle ") + ::cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(handle_t node_handle, template_::handle_t graph_template_handle) +{ + return identify(node_handle) + " in " + template_::detail_::identify(graph_template_handle); +} + +} // namespace detail_ + +} // namespace node + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif // CUDA_GRAPH_API_WRAPPERS_ERROR_HPP_ diff --git a/src/cuda/api/graph/instance.hpp b/src/cuda/api/graph/instance.hpp new file mode 100644 index 00000000..fe8179c3 --- /dev/null +++ b/src/cuda/api/graph/instance.hpp @@ -0,0 +1,504 @@ +/** + * @file + * + * @brief A CUDA execution graph instance wrapper class and some + * associated definitions. 
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_INSTANCE_HPP +#define CUDA_API_WRAPPERS_INSTANCE_HPP + +#if CUDA_VERSION >= 10000 + +#include "node.hpp" +#include "identify.hpp" +#include "../types.hpp" + +namespace cuda { + +class stream_t; + +namespace graph { + +class instance_t; + +///@endcond +class template_t; +///@endcond + +namespace instance { + +using update_status_t = CUgraphExecUpdateResult; + +namespace update_status { + +enum named_t : ::std::underlying_type::type { + success = CU_GRAPH_EXEC_UPDATE_SUCCESS, + failure_for_unexpected_reason = CU_GRAPH_EXEC_UPDATE_ERROR, + topology_has_changed = CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED, + node_type_has_changed = CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED, + kernel_node_function_has_changed = CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED, + unsupported_kind_of_parameter_change = CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED, + unsupported_aspect_of_node = CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED, +#if CUDA_VERSION >= 11000 + unsupported_kind_of_kernel_node_function_change = CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE, + unsupported_kind_of_node_attributes_change = CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED, +#endif // CUDA_VERSION >= 11000 +}; + +constexpr inline bool operator==(const update_status_t &lhs, const named_t &rhs) noexcept { return lhs == static_cast(rhs); } +constexpr inline bool operator!=(const update_status_t &lhs, const named_t &rhs) noexcept { return lhs != static_cast(rhs); } +constexpr inline bool operator==(const named_t &lhs, const update_status_t &rhs) noexcept { return static_cast(lhs) == rhs; } +constexpr inline bool operator!=(const named_t &lhs, const update_status_t &rhs) noexcept { return static_cast(lhs) != rhs; } + +namespace detail_ { + +const char *const descriptions[] = { + "success", + "failure for an unexpected reason described in the return value of the function", + "topology has changed", + "node type has changed", + "kernel node function has changed", + "parameters changed in an unsupported way", + "something about the node is not supported", + "unsupported kind of kernel node function change", + "unsupported kind of node attributes change" +}; + +inline bool is_node_specific(update_status_t update_status) +{ + return + (update_status != success) and + (update_status != failure_for_unexpected_reason) and + (update_status != topology_has_changed) and + (update_status != unsupported_kind_of_parameter_change); +} + +} // namespace detail_ + +} // namespace update_status + +namespace detail_ { + +using flags_t = cuuint64_t; + +inline const char *describe(instance::update_status_t update_status) +{ + return instance::update_status::detail_::descriptions[update_status]; +} + +inline ::std::string describe( + instance::update_status_t update_status, + node::handle_t node_handle, + template_::handle_t graph_template_handle); + +} // namespace detail_ + +} // namespace instance + + +/** + * @brief enqueues the execution of a execution-graph instance via a stream + * + * @note recall that graph execution is not serialized on the stream, i.e. one could say that the execution is not + * restricted to that single stream; the relation to the stream is that no graph node will be executed before the + * currently-enqueued work is concluded, and that no further work on the stream will proceed until the graph is fully + * executed. + * + * @note Only one execution graph instance may be executing at a time. 
One can, however, execute the same graph + * concurrently multiple times, but creating multiple instances of the same graph template (@ref template_t). + * It is possible to execute/launch the same instance multiple times _sequentially_. + * + * @note Take care to have the graph either free any allocations it makes, or use a graph instantiated with + * @ref CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH - or the (second) launch of the instance will fail. + * + * @note An execution graph instance must be "uploaded" to the GPU before it is executed - i.e. have + * appropriate physical device resources allocated for it. If this is not done on instantiation, nor + * explicitly with @ref instance::upload, calling this function will first upload it, then schedule + * it. + */ + void launch(const instance_t& instance, const stream_t& stream); + +} // namespace graph + +/** + * @brief Determine whether the API call returning the specified status had succeeded + */ +inline ::std::string describe(graph::instance::update_status_t status) +{ + return graph::instance::detail_::describe(status); +} + +::std::string describe(graph::instance::update_status_t update_status, optional node); + +/** + * @brief Determine whether the API call returning the specified status had succeeded + */ +inline constexpr bool is_success(graph::instance::update_status_t status) +{ + return status == graph::instance::update_status::success; +} + +/** + * @brief Determine whether the API call returning the specified status had failed + */ +constexpr bool is_failure(graph::instance::update_status_t status) { return not is_success(status); } + +namespace graph { + +namespace instance { + +instance_t wrap(template_::handle_t template_handle, handle_t handle, bool is_owning) noexcept; + +namespace detail_ { + +::std::string identify(const instance_t &instance); + +} // namespace detail_ + +// TODO: Add support for reporting errors involving edges +class update_failure : public ::std::runtime_error { +public: + using parent = ::std::runtime_error; + + update_failure( + update_status_t kind, + optional&& impermissible_node, + ::std::string&& what_arg) noexcept + : + parent((what_arg.empty() ? "" : what_arg + ": ") + describe(kind, impermissible_node)), + kind_(kind), + impermissible_node_(std::move(impermissible_node)) + { + // TODO: Ensure the kind needs a node handle IFF a node handle has been provided + } + + update_failure(update_status_t kind, node_t impermissible_node) noexcept + : update_failure(kind, optional(std::move(impermissible_node)), "") + { } + + update_status_t kind() const noexcept { return kind_; } + node_t impermissible_node() const { return impermissible_node_.value(); } + +private: + update_status_t kind_; + optional impermissible_node_; +}; + +/** + * Update the nodes of an execution graph instance with the settings in the nodes + * of a compatible execution graph template. + * + * @param destination An execution graph instance whose node settings are to be updated + * @param source An execution graph template which is either the one from which @p destination + * was instantiated, or one that is "topologically identical" to the instance, i.e. has the + * same types of nodes with the same edges (and no others). 
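+ *
+ * As a hedged illustration (hypothetical user code, assuming `instance` was
+ * instantiated from a graph template `graph_template` whose node parameters
+ * have since been modified):
+ *
+ * @code
+ * cuda::graph::instance::update(instance, graph_template);
+ * // ... or, equivalently, via the instance wrapper itself:
+ * instance.update(graph_template);
+ * @endcode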
+ */ +void update(const instance_t& destination, const template_t& source); + +namespace detail_ { + +template +status_t set_node_parameters_nothrow( + const instance::handle_t instance_handle, + const node::handle_t node_handle, + const typename node::detail_::kind_traits::raw_parameters_type raw_params) +{ + auto raw_params_maybe_ptr = node::detail_::maybe_add_ptr(raw_params); + return node::detail_::kind_traits::instance_setter(instance_handle, node_handle, raw_params_maybe_ptr); +} + +} // namespace detail_ + + +template +void set_node_parameters( + const instance_t& instance, + const node_t& node, + const node::parameters_t parameters); + +} // namespace instance + +namespace detail_ { + +inline void launch_graph_in_current_context(stream::handle_t stream_handle, instance::handle_t graph_instance_handle) +{ + auto status = cuGraphLaunch(graph_instance_handle, stream_handle); + throw_if_error_lazy(status, "Trying to launch " + + instance::detail_::identify(graph_instance_handle) + " on " + stream::detail_::identify(stream_handle)); +} + +inline void launch(context::handle_t context_handle, stream::handle_t stream_handle, instance::handle_t graph_instance_handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + launch_graph_in_current_context(stream_handle, graph_instance_handle); +} + +} // namespace detail_ + +class instance_t { +public: // data types + using handle_type = instance::handle_t; + +public: // getters + template_::handle_t template_handle() const noexcept { return template_handle_; } + handle_type handle() const noexcept { return handle_; } + bool is_owning() const noexcept { return owning_; } + +protected: // constructors + instance_t(template_::handle_t template_handle, handle_type handle, bool owning) noexcept + : template_handle_(template_handle), handle_(handle), owning_(owning) + { } + +public: // constructors & destructor + instance_t(const instance_t& other) noexcept = delete; + + instance_t(instance_t&& other) noexcept : instance_t(other.template_handle_, other.handle_, other.owning_) + { + other.owning_ = false; + } + ~instance_t() + { + if (owning_) cuGraphExecDestroy(handle_); + } + +public: // operators + instance_t& operator=(const instance_t&) = delete; + instance_t& operator=(instance_t&& other) noexcept + { + ::std::swap(template_handle_, other.template_handle_); + ::std::swap(handle_, other.handle_); + ::std::swap(owning_, other.owning_); + return *this; + } + + +public: // friends + friend instance_t instance::wrap(template_::handle_t template_handle, handle_type handle, bool is_owning) noexcept; + +public: // non-mutators + void update(const template_t& update_source) const + { + instance::update(*this, update_source); + } + + void launch(const stream_t& stream) const + { + graph::launch(*this, stream); + } + void upload(const stream_t& stream) const; + + +/* bool free_previous_allocations_before_relaunch +#if CUDA_VERSION >= 12000 + , bool upload_on_instantiation, + , bool make_device_launchable, +#endif // CUDA_VERSION >= 12000 +#if CUDA_VERSION >= 11700 + , bool use_per_node_priorities +#endif*/ + +#if CUDA_VERSION >= 12000 + bool frees_allocations_before_relaunch() const + { + instance::detail_::flags_t flags; + auto status = cuGraphExecGetFlags (handle_, &flags); + throw_if_error_lazy(status, "Obtaining execution graph instance flags"); + return flags & CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH; + } + + bool uses_node_priorities() const + { + instance::detail_::flags_t flags; + auto status = 
cuGraphExecGetFlags (handle_, &flags);
+		throw_if_error_lazy(status, "Obtaining execution graph instance flags");
+		return flags & CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY;
+	}
+
+#endif
+
+	template
+	void set_node_parameters(const node_t& node, node::parameters_t new_parameters)
+	{
+		instance::set_node_parameters(*this, node, std::move(new_parameters));
+	}
+
+	template
+	void set_node_parameters(const node::typed_node_t& node)
+	{
+		instance::set_node_parameters(*this, node);
+	}
+
+protected:
+	template_::handle_t template_handle_;
+	handle_type handle_;
+	bool owning_;
+};
+
+/**
+ * @brief Have a GPU reserve resources for, and maintain a "copy" of, an execution graph
+ * instance, allowing it to actually be scheduled for execution.
+ *
+ * @note The upload can happen in one of three ways:
+ *
+ * 1. On creation of the graph instance (@ref instance_t);
+ * 2. Explicitly, with this function;
+ * 3. Implicitly, if @ref launch is called before the graph instance was otherwise uploaded -
+ *    before the actual scheduling of the instance for execution via the stream.
+ *
+ */
+void upload(const instance_t& instance, const stream_t& stream);
+
+namespace instance {
+
+inline instance_t wrap(template_::handle_t template_handle, handle_t handle, bool is_owning) noexcept
+{
+	return instance_t{template_handle, handle, is_owning};
+}
+
+enum : bool {
+	do_free_previous_allocations_before_relaunch = true,
+	auto_free = true,
+	dont_free_previous_allocations_before_relaunch = false,
+	no_auto_free = false,
+#if CUDA_VERSION >= 12000
+
+	do_upload_on_instantiation = true,
+	dont_upload_on_instantiation = false,
+	auto_upload = true,
+	no_auto_upload = false,
+	manual_upload = false,
+
+	make_launchable_from_device_code = true,
+	dont_make_launchable_from_device_code = false,
+	do_make_device_launchable = true,
+	dont_make_device_launchable = false,
+#endif // CUDA_VERSION >= 12000
+#if CUDA_VERSION >= 11700
+
+	do_use_per_node_priorities = true,
+	do_use_per_node_priority = true,
+	dont_use_per_node_priorities = false,
+	dont_use_per_node_priority = false,
+	use_stream_priority = false
+#endif // CUDA_VERSION >= 11700
+};
+
+namespace detail_ {
+
+#if CUDA_VERSION >= 11000
+inline flags_t build_flags(
+	bool free_previous_allocations_before_relaunch
+#if CUDA_VERSION >= 12000
+	, bool upload_on_instantiation
+	, bool make_device_launchable
+#endif // CUDA_VERSION >= 12000
+#if CUDA_VERSION >= 11700
+	, bool use_per_node_priorities
+#endif
+	)
+{
+	return
+		(free_previous_allocations_before_relaunch ? CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH : 0)
+#if CUDA_VERSION >= 12000
+		| (upload_on_instantiation ? CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD : 0)
+		| (make_device_launchable ? CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH : 0)
+#endif
+#if CUDA_VERSION >= 11700
+		| (use_per_node_priorities ?
CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY : 0) +#endif + ; +} +#endif // CUDA_VERSION >= 11000 + +inline ::std::string identify(const instance_t& instance) +{ + return identify(instance.handle()) + " instantiated from " + + template_::detail_::identify(instance.template_handle()); +} + +inline ::std::string identify(const instance_t& instance, const template_t& template_) +{ + return identify(instance.handle()) + " instantiated from " + + template_::detail_::identify(template_); +} + + + +} // namespace detail_ + +template +void set_node_parameters( + const instance_t& instance, + const node_t& node, + node::parameters_t parameters) +{ + auto status = detail_::set_node_parameters_nothrow( + instance.handle(), node.handle(), node::detail_::kind_traits::marshal(parameters)); + throw_if_error_lazy(status, "Setting parameters of " + node::detail_::identify(node) + + " in " + instance::detail_::identify(instance)); +} + + +template +void set_node_parameters( + const instance_t& instance, + const node::typed_node_t& node_with_new_params) +{ + return set_node_parameters( + instance, static_cast(node_with_new_params), node_with_new_params.parameters()); +} + + +} // namespace instance + +inline instance_t instantiate( + const template_t& template_ +#if CUDA_VERSION >= 11000 + , bool free_previous_allocations_before_relaunch = false +#endif +#if CUDA_VERSION >= 12000 + , bool upload_on_instantiation = false + , bool make_device_launchable = false +#endif +#if CUDA_VERSION >= 11700 + , bool use_per_node_priorities = false +#endif +) +{ +#if CUDA_VERSION >= 11000 + instance::detail_::flags_t flags = instance::detail_::build_flags( + free_previous_allocations_before_relaunch +#if CUDA_VERSION >= 12000 + , upload_on_instantiation, make_device_launchable +#endif +#if CUDA_VERSION >= 11700 + , use_per_node_priorities +#endif + ); +#endif // CUDA_VERSION >= 11000 + instance::handle_t instance_handle; +#if CUDA_VERSION >= 11000 + auto status = cuGraphInstantiateWithFlags(&instance_handle, template_.handle(), static_cast(flags)); + throw_if_error_lazy(status, "Instantiating " + template_::detail_::identify(template_) ); +#else + static constexpr const size_t log_buffer_size { 2048 }; + dynarray log_buffer(log_buffer_size); + node::handle_t error_node; + auto status = cuGraphInstantiate(&instance_handle, template_.handle(), &error_node, log_buffer.data(), log_buffer_size); + throw_if_error_lazy(status, "Instantiating " + template_::detail_::identify(template_) + ": error at " + + node::detail_::identify(error_node) + " ; log buffer contents:\n" + log_buffer.data()); +#endif // CUDA_VERSION >= 11000 + static constexpr const bool is_owning { true }; + return instance::wrap(template_.handle(), instance_handle, is_owning); +} + +void launch(const cuda::stream_t& stream, const instance_t& instance); + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif //CUDA_API_WRAPPERS_INSTANCE_HPP diff --git a/src/cuda/api/graph/node.hpp b/src/cuda/api/graph/node.hpp new file mode 100644 index 00000000..a5cc3501 --- /dev/null +++ b/src/cuda/api/graph/node.hpp @@ -0,0 +1,152 @@ +/** + * @file + * + * @brief Graph template node proxy (base-)class base-class @ref node_t and supporting code. 
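+ *
+ * As a brief sketch of typical (read-only) use of a node wrapper - hypothetical
+ * user code, assuming a @ref node_t named `node` obtained from a graph template:
+ *
+ * @code
+ * auto node_type = node.type_();
+ * auto num_deps  = node.num_dependencies();
+ * for (const auto& dependency : node.dependencies()) {
+ *     // ... inspect dependency.handle(), its own dependencies, etc.
+ * }
+ * @endcode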
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_NODE_HPP +#define CUDA_API_WRAPPERS_NODE_HPP + +#if CUDA_VERSION >= 10000 + +#include "../types.hpp" + +namespace cuda { + +namespace graph { + +///@endcond +class node_t; +class template_t; +class instance_t; +///@endcond + +namespace node { + +node_t wrap(template_::handle_t graph_handle, handle_t handle) noexcept; + +namespace detail_ { + +::std::string identify(const node_t &node); + +} // namespace detail_ + +using type_t = CUgraphNodeType; + +} // namespace node + +/** + * @brief Wrapper class for a CUDA execution graph node + * + * Use this class to pass and receive nodes from/to other CUDA API wrapper functions + * and objects (in particular, `cuda::graph::template_t` and `cuda::graph::instance_t`). + * + * @note { This is a reference-type; it does not own the node, and will not remove + * the node from its graph-template when destroyed. You may therefore safely make copies + * of it. } + * + * @note { A node is always tied to a specific graph-template; it cannot be added to + * another graph-template or used independently. } + */ +class node_t { +public: // data types + using handle_type = node::handle_t; + using dependencies_type = ::std::vector; + using dependents_type = ::std::vector; + using type_type = node::type_t; + using size_type = size_t; + +// TODO: WRITEME +public: + handle_type handle() const noexcept { return handle_; } + template_t containing_graph() const noexcept; + template_::handle_t containing_graph_handle() const noexcept { return graph_template_handle_; } + type_type type_() const + { + type_type result; + auto status = cuGraphNodeGetType(handle_, &result); + throw_if_error_lazy(status, "Obtaining the type of " + node::detail_::identify(*this)); + return result; + } + + size_t num_dependencies() const + { + size_t num_dependencies_; + auto status = cuGraphNodeGetDependencies(handle_, nullptr, &num_dependencies_); + throw_if_error_lazy(status, "Obtaining the number of nodes on which " + node::detail_::identify(*this) + " is dependent"); + return num_dependencies_; + } + + size_t num_dependents() const + { + size_t num_dependents_; + auto status = cuGraphNodeGetDependentNodes(handle_, nullptr, &num_dependents_); + throw_if_error_lazy(status, "Obtaining the number of nodes dependent on " + node::detail_::identify(*this)); + return num_dependents_; + } + + dependencies_type dependencies() const + { + size_type num_dependencies_ {num_dependencies() } ; + ::std::vector node_handles {num_dependencies_ }; + auto status = cuGraphNodeGetDependencies(handle_, node_handles.data(), &num_dependencies_); + throw_if_error_lazy(status, "Obtaining the set nodes on which " + node::detail_::identify(*this) + " is dependent"); + dependencies_type result; + for (const auto& node_handle : node_handles) { + result.emplace_back(node::wrap(graph_template_handle_, node_handle)); + } + return result; + } + + dependencies_type dependents() const + { + size_type num_dependents_ { num_dependents() } ; + ::std::vector node_handles {num_dependents_ }; + auto status = cuGraphNodeGetDependentNodes(handle_, node_handles.data(), &num_dependents_); + throw_if_error_lazy(status, "Obtaining the set nodes dependent on " + node::detail_::identify(*this)); + dependencies_type result; + for (const auto& node_handle : node_handles) { + result.emplace_back(node::wrap(graph_template_handle_, node_handle)); + } + return result; + } + +protected: // constructors and destructors + node_t(template_::handle_t graph_template_handle, handle_type handle) noexcept + : 
graph_template_handle_(graph_template_handle), handle_(handle) { } + +public: // friendship + friend node_t node::wrap(template_::handle_t graph_handle, node::handle_t handle) noexcept; + +public: // constructors and destructors + node_t(const node_t&) noexcept = default; // It's a reference type, so copying is not a problem + node_t(node_t&&) noexcept = default; // It's a reference type, so copying is not a problem + + node_t& operator=(node_t other) noexcept + { + graph_template_handle_ = other.graph_template_handle_; + handle_ = other.handle_; + return *this; + } + +protected: + template_::handle_t graph_template_handle_; + handle_type handle_; +}; + +namespace node { + +inline node_t wrap(template_::handle_t graph_handle, handle_t handle) noexcept +{ + return { graph_handle, handle }; +} + +} // namespace node + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif //CUDA_API_WRAPPERS_NODE_HPP diff --git a/src/cuda/api/graph/node_builder.hpp b/src/cuda/api/graph/node_builder.hpp new file mode 100644 index 00000000..58f3ca19 --- /dev/null +++ b/src/cuda/api/graph/node_builder.hpp @@ -0,0 +1,606 @@ +/** + * @file + * + * @brief Convenience classes for construction execution graph nodes + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_NODE_BUILDER_HPP +#define CUDA_API_WRAPPERS_NODE_BUILDER_HPP + +#if CUDA_VERSION >= 10000 + +#include "typed_node.hpp" + +namespace cuda { + +namespace graph { + +namespace node { + +namespace detail_ { + +inline ::std::logic_error make_unspec_error(const char *node_type, const char *missing_arg_name) +{ + // Yes, returning it, not throwing it. This is an exception builder function + return ::std::logic_error( + ::std::string("Attempt to build a CUDA execution graph node of type ") + node_type + + " without specifying its " + missing_arg_name + " argument"); +} + +} // namespace detail_ + +template +class typed_builder_t; + +class builder_t +{ +public: + template + typed_builder_t kind() { return typed_builder_t{}; } +}; + +// Note: Can't build empty vertices for now + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::child_graph; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool template_ { false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. 
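+	//
+	// A hedged usage sketch (hypothetical caller code, assuming a graph template
+	// `graph_template` and an already-built subgraph template `subgraph`):
+	//
+	//     builder_t{}.kind<kind_t::child_graph>()
+	//         .template_(subgraph)
+	//         .build_within(graph_template);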
+ template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type& template_(template_t subgraph) { + return do_([&] { + params_ = std::move(subgraph); + was_set.template_ = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.template_) { + throw detail_::make_unspec_error("child graph", "child graph template"); + } + return graph_template.insert.node(std::move(params_)); + } +}; // typed_builder_t + +#if CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::record_event; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool event { false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type& event(event_t event) { + return do_([&] { + params_ = std::move(event); + was_set.event = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.event) { + throw detail_::make_unspec_error("record event", "event"); + } + return graph_template.insert.node(std::move(params_)); + } +}; // typed_builder_t + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::wait_on_event; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool event { false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type& event(event_t event) { + return do_([&] { + params_ = std::move(event); + was_set.event = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.event) { + throw detail_::make_unspec_error("wait on event", "event"); + } + return graph_template.insert.node(std::move(params_)); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::host_function_call; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool function_ptr_set { false }; + bool user_argument_set {false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. 
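+	//
+	// A hedged usage sketch (hypothetical caller code, assuming a graph template
+	// `graph_template`, a host-side callback `my_callback` of type stream::callback_t,
+	// and a `user_data` pointer to pass to it):
+	//
+	//     builder_t{}.kind<kind_t::host_function_call>()
+	//         .function(my_callback)
+	//         .argument(user_data)
+	//         .build_within(graph_template);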
+ template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type function(stream::callback_t host_callback_function) + { + return do_([&] { + params_.function_ptr = host_callback_function; + was_set.function_ptr_set = true; + }); + } + + this_type argument(void* callback_argument) + { + return do_([&] { + params_.user_data = callback_argument; + was_set.user_argument_set = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.function_ptr_set) { + throw detail_::make_unspec_error("kernel_launch", "host callback function pointer"); + } + if (not was_set.user_argument_set) { + throw detail_::make_unspec_error("kernel_launch", "user-specified callback function argument"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::kernel_launch; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_ { + kernel_t { kernel::wrap(cuda::device::id_t(0), nullptr, nullptr) }, + { 0, 0 }, + { } + }; // An ugly way of constructing with invalid junk; see `was_set` below. We could + // have possibly used some kind of optional + + struct { + bool kernel { false }; + bool launch_config { false }; + bool marshalled_arguments { false }; + } was_set; // Yes, this is an ugly alternative to using optionals; but - have + // you ever looked at the implementation of optional?... + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type kernel(const kernel_t& kernel) + { + return do_([&] { + // we can't just make an assignment to the `kernel` field, we have to reassign + // the whole structure... + params_ = { kernel, params_.launch_config, std::move(params_.marshalled_arguments) }; + was_set.kernel = true; + }); + } + + // Note: There is _no_ member for passing an apriori compiled kernel + // function and a device, since that would either mean leaking a primary context ref unit, + // or actually holding on to one in this class, which doesn't make sense. The graph template + // can't hold a ref unit... + + this_type launch_configuration(launch_configuration_t launch_config) + { + return do_([&] { + params_.launch_config = launch_config; + was_set.launch_config = true; + }); + } + + this_type marshalled_arguments(::std::vector argument_ptrs) + { + return do_([&] { + params_.marshalled_arguments = std::move(argument_ptrs); + was_set.marshalled_arguments = true; + }); + } + + template + this_type arguments(Ts&&... 
args) + { + return marshalled_arguments(make_kernel_argument_pointers(::std::forward(args)...)); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.kernel) { + throw detail_::make_unspec_error("kernel_launch", "kernel"); + } + if (not was_set.launch_config) { + throw detail_::make_unspec_error("kernel_launch", "launch configuration"); + } + if (not was_set.marshalled_arguments) { + throw detail_::make_unspec_error("kernel_launch", "launch arguments"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#if CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_allocation; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + using endpoint_t = cuda::memory::endpoint_t; + +protected: + params_type params_; + + struct { + bool device { false }; + bool size_in_bytes {false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + template + this_type& do_(F f) + { + f(); + return *this; + } + +public: + params_type& params() { return params_; } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.device) { + throw detail_::make_unspec_error("memory allocation", "device"); + } + if (not was_set.size_in_bytes) { + throw detail_::make_unspec_error("memory allocation", "allocation size in bytes"); + } + return graph_template.insert.node(params_); + } + + this_type& device(const device_t& device) { + return do_([&]{ params_.first = device; was_set.device = true; }); + } + this_type& size(const size_t size) { + return do_([&]{ params_.second = size; was_set.size_in_bytes = true; }); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_copy; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + using dimensions_type = params_type::dimensions_type; + using endpoint_t = cuda::memory::endpoint_t; +// static constexpr const dimensionality_t num_dimensions = traits::num_dimensions; + + +protected: + params_type params_; + + template + this_type& do_(F f) + { + f(); + return *this; + } + +public: + params_type& params() { return params_; } +// built_type build(); + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + // TODO: What about the extent???!!! + return graph_template.insert.node(params_); + } + +// this_type& context(endpoint_t endpoint, const context_t& context) noexcept +// { +// do_([&] { params_.set_context(endpoint, context); } ); +// } +// +// this_type& single_context(const context_t& context) noexcept +// { +// do_([&] { params_.set_single_context(context); } ); +// } + + // Note: This next variadic method should not be necessary considering + // the one right after it which uses the forwarding idiom; and yet - if we + // only keep the forwarding-source-method, we get errors. +// template +// this_type& source(const Ts&... args) { +// return do_([&]{ params_.set_source(args...); }); +// } + + template + this_type& source(Ts&&... 
args) { + return do_([&]{ params_.set_source(std::forward(args)...); }); + } +// +// template +// this_type& destination(const Ts&... args) { +// return do_([&]{ params_.set_destination(args...); }); +// } + + template + this_type& destination(Ts&&... args) { + return do_([&]{ params_.set_destination(std::forward(args)...); }); + } + + template + this_type& endpoint(endpoint_t endpoint, Ts&&... args) { + return do_([&]{ params_.set_endpoint(endpoint, std::forward(args)...); }); + } + +// this_type& source_untyped(context::handle_t context_handle, void *ptr, dimensions_type dimensions) noexcept +// { +// return do_([&] { params_.set_endpoint_untyped(endpoint_t::source, context_handle, ptr, dimensions); } ); +// } +// +// this_type& destination_untyped(context::handle_t context_handle, void *ptr, dimensions_type dimensions) noexcept +// { +// return do_([&] { params_.set_destination_untyped(context_handle, ptr, dimensions); } ); +// } +// +// this_type& endpoint_untyped(endpoint_t endpoint, context::handle_t context_handle, void *ptr, dimensions_type dimensions) noexcept +// { +// return do_([&] { params_.set_endpoint_untyped(endpoint_t::source, context_handle, ptr, dimensions); } ); +// } + + // TODO: Need a proper builder for copy parameters; otherwise we'll need to implement one here, when it's + // already half-implemented there... it will need: + // 1. To sort out context stuff (already done in the copy parameters, but requires explicit setting atm + // 2. deduce extent when none specified + // 3. prevent direct manipulation of the parameters (which is currently allowed), so that we can apply logic + // such as "has the extent been set?" etc. + // 4. set defaults when relevant, e.g. w.r.t. pitches and such +}; // typed_builder_t + +template <> +class typed_builder_t { + // Note: Unlike memory_copy, for which the underlying parameter type, CUDA_MEMCPY3D_PEER, is also used + // in non-graph context - here the only builder functionality is for graph vertex construction; so we don't + // do any forwarding to a rich parameters class or its own builder. +public: + static constexpr const kind_t kind = kind_t::memory_set; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + struct { + bool region { false }; + bool value_and_width { false }; + } was_set; + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + + template + void set_width() { + } + +public: + const params_type& params() { return params_; } + + this_type region(memory::region_t region) noexcept + { + return do_([&] { params_.region = region; was_set.region = true;}); + } + + template + this_type value(uint32_t v) noexcept + { + static_assert(sizeof(T) <= 4, "Type of value to set is too wide; maximum size is 4"); + static_assert(sizeof(T) != 3, "Size of type to set is not a power of 2"); + static_assert(std::is_trivially_copy_constructible::value, "Only a trivially-constructible value can be used for memset'ing"); + return do_([&] { + params_.width_in_bytes = sizeof(T); + switch(sizeof(T)) { + // TODO: Maybe we should use uint_t template? Maybe use if constexpr with C++17? 
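+			// Note: the element width recorded in params_.width_in_bytes just above is what
+			// determines how many of the value's bytes get replicated across the region.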
+ case 1: params_.value = reinterpret_cast(v); break; + case 2: params_.value = reinterpret_cast(v); break; + case 4: + default: params_.value = reinterpret_cast(v); break; + } + was_set.value_and_width = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.region) { + throw detail_::make_unspec_error("memory set", "memory region"); + } + if (not was_set.value_and_width) { + throw detail_::make_unspec_error("memory set", "value to set"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#if CUDA_VERSION >= 11000 +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_free; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + struct { + bool address { false }; + } was_set; + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + const params_type& params() { return params_; } + + this_type region(void* address) noexcept { return do_([&] { params_ = address; was_set.address = true;}); } + this_type region(memory::region_t allocated_region) noexcept { return this->region(allocated_region.data()); } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.address) { + throw detail_::make_unspec_error("memory free", "allocated region starting address"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11000 + +#if CUDA_VERSION >= 11070 +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_barrier; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + struct { + bool context { false }; + bool barrier_socpe { false }; + } was_set; + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. 
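+	//
+	// Note: build_within() below requires both the context and the barrier scope to
+	// have been set; omitting either throws a ::std::logic_error.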
+ template this_type& do_(F f) { f(); return *this; } + +public: + const params_type& params() { return params_; } + + this_type context(context_t context) noexcept + { + return do_([&] { + params_.first = std::move(context); + was_set.context = true;}); + } + + this_type context(memory::barrier_scope_t barrier_socpe) noexcept + { + return do_([&] { params_.second = barrier_socpe; was_set.barrier_socpe = true;}); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.context) { + throw detail_::make_unspec_error("memory barrier", "CUDA context"); + } + if (not was_set.barrier_socpe) { + throw detail_::make_unspec_error("memory barrier", "barrier scope"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11070 + +} // namespace node + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif //CUDA_API_WRAPPERS_NODE_BUILDER_HPP diff --git a/src/cuda/api/graph/template.hpp b/src/cuda/api/graph/template.hpp new file mode 100644 index 00000000..bc7e3f34 --- /dev/null +++ b/src/cuda/api/graph/template.hpp @@ -0,0 +1,829 @@ +/** + * @file + * + * @brief An implementation of a subclass of @ref `kernel_t` for kernels + * compiled together with the host-side program. + * + * @todo Implement batch mem op insertion and param setting: + * cuGraphAddBatchMemOpNode, cuGraphBatchMemOpNodeGetParams, cuGraphBatchMemOpNodeSetParams + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_GRAPH_TEMPLATE_HPP +#define CUDA_API_WRAPPERS_GRAPH_TEMPLATE_HPP + +#if CUDA_VERSION >= 10000 + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cuda { + +///@cond +class device_t; +class stream_t; +///@endcond + +namespace graph { + +///@endcond +class template_t; +class instance_t; +///@endcond + +namespace node { + +namespace detail_ { + +// I'm not so sure about this... +using edge_t = ::std::pair; + +inline ::std::string identify(const edge_t &edge) +{ + return ::std::string("edge from " + node::detail_::identify(edge.first) + + " to " + node::detail_::identify(edge.second)); +} + +template +handle_t as_handle(const NodeOrHandle& node_or_handle) noexcept +{ + return node_or_handle.handle(); +} + +template <> inline handle_t as_handle(const handle_t& handle) noexcept { return handle; } + +template