Skip to content

Commit

Permalink
Fixes #175, #458: Added graph support, including stream capture and a…
Browse files Browse the repository at this point in the history
… graph node builder class.

Also added two modified CUDA sample programs using graph support:

* graphMemoryNodes
* jacobiCudaGraphs
  • Loading branch information
eyalroz committed May 4, 2024
1 parent aad918a commit db63160
Show file tree
Hide file tree
Showing 24 changed files with 4,452 additions and 23 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ write_basic_package_version_file(
COMPATIBILITY ${COMPAT_SETTING}
)


install(
FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-api-wrappers-config-version.cmake"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-api-wrappers"
Expand Down
12 changes: 12 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,19 @@ if(USE_COOPERATIVE_GROUPS AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
add_executable(binaryPartitionCG modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu)
endif()
add_executable(bandwidthtest modified_cuda_samples/bandwidthtest/bandwidthtest.cpp)
# CUDA graphs were introduced in CUDA 10.0, so the graph-based example
# programs can only be built against that toolkit version or later
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.0")
	add_executable(simpleCudaGraphs modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu)
	add_executable(jacobiCudaGraphs
		modified_cuda_samples/jacobiCudaGraphs/main.cpp
		modified_cuda_samples/jacobiCudaGraphs/jacobi.cu
	)
endif()
#----
# Umbrella target for building all of the modified-CUDA-sample examples at once
add_custom_target(modified_cuda_samples)
add_dependencies(modified_cuda_samples
	vectorAdd inlinePTX simpleStreams simpleIPC
	# simpleCudaGraphs graphMemoryNodes jacobiCudaGraphs
	# NOTE(review): the graph-based samples above are commented out of the
	# umbrella target - presumably because they are only built conditionally
	# (CUDA >= 10.0); confirm this exclusion is intentional.
)

add_executable(version_management by_api_module/version_management.cpp)
add_executable(error_handling by_api_module/error_handling.cu)
Expand Down
12 changes: 12 additions & 0 deletions examples/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ bool your_type_was_() { return true; }
#define print_type_of(_x) your_type_was_<decltype(_x)>()
#endif

// Returns the English ordinal suffix for an integer, e.g. 1 -> "st",
// 2 -> "nd", 11 -> "th", 23 -> "rd". Handles the 11/12/13 special cases
// (which take "th" despite their last digit), and is safe for negative
// and zero inputs as well (the suffix of the absolute value is used).
inline const char* ordinal_suffix(int n)
{
	static const char suffixes [4][5] = {"th", "st", "nd", "rd"};
	auto ord = n % 100;
	// In C++, % may yield a negative remainder for negative n; a negative
	// value would index out of bounds below, so take the absolute value
	if (ord < 0) { ord = -ord; }
	// 11th, 12th, 13th - not 11st, 12nd, 13rd
	if (ord / 10 == 1) { ord = 0; }
	ord = ord % 10;
	return suffixes[ord > 3 ? 0 : ord];
}

// Renders a number as an English ordinal string, e.g. 1 -> "1st",
// 22 -> "22nd", 30 -> "30th".
template <typename N = int>
inline ::std::string xth(N n)
{
	::std::string result = ::std::to_string(n);
	result += ordinal_suffix(n);
	return result;
}

const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref)
{
static const char* cache_preference_names[] = {
Expand Down
186 changes: 186 additions & 0 deletions examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#include "jacobi_kernels.cuh"
#include "jacobi.h"

#include <vector>
#include <iomanip>
#include <iostream>
#include <cuda/api.hpp>

// Computes and reports the final error sum once the Jacobi iterations are
// done: zeroes the device-side accumulator d_sum, launches the finalError
// kernel over x_to_overwrite, copies the result back into `sum` on the host,
// and prints it via report_error_sum (labeled "GPU", with k+1 iterations).
//
// Note: this synchronizes `stream` before returning, so `sum` is valid on
// the host when this function completes.
static void finalize_error(
	const cuda::stream_t& stream, span<double> d_sum, const cuda::launch_configuration_t& launch_config,
	double& sum, int k, const span<double> x_to_overwrite)
{
	stream.enqueue.memzero(d_sum);
	// Derive a launch configuration for the reduction kernel from the one
	// used for the Jacobi kernel: same block size, but grid size rounded up
	// to cover N_ROWS, plus dynamic shared memory of one double per warp
	// (and one extra)
	auto final_error_launch_config = launch_config;
	final_error_launch_config.dimensions.grid.x = (N_ROWS / final_error_launch_config.dimensions.block.x) + 1;
	auto warps_per_block = final_error_launch_config.dimensions.block.x / cuda::warp_size;
	final_error_launch_config.dynamic_shared_memory_size = (warps_per_block + 1) * sizeof(double);
	// TODO: Double-check the original source to ensure we're using the right x here
	stream.enqueue.kernel_launch(finalError, final_error_launch_config, x_to_overwrite.data(), d_sum.data());
	stream.enqueue.copy(&sum, d_sum);
	stream.synchronize();
	report_error_sum("GPU", k + 1, sum);
}

// Jacobi solver variant: builds a CUDA graph explicitly, with three nodes
// (memset -> kernel -> memcpy), instantiates it once, and re-launches the
// same executable instance every iteration - swapping the kernel node's
// x/x_new arguments between iterations via set_node_parameters() rather
// than rebuilding or re-instantiating the graph.
//
// Returns the final error sum (written into `sum` on the host by the
// graph's memcpy node on each launch).
template<>
double do_jacobi_inner<computation_method_t::graph_with_set_kernel_params>(
	const cuda::device_t &device,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	// Host-side destination of the per-iteration error sum; the graph's
	// memcpy node writes into this on every launch
	double sum;

	auto graph = cuda::graph::create();

	using cuda::graph::node::kind_t;

	// Node 1: zero the device-side accumulator before each kernel run
	auto memset_node = [&] {
		cuda::graph::node::parameters_t<kind_t::memory_set> params;
		params.value = 0;
		params.width_in_bytes = 4;
		params.region = d_sum;
		return graph.insert.node<kind_t::memory_set>(params);
	}();

	// Two alternative argument sets for the kernel node - differing only in
	// which of x / x_new is read and which is overwritten - so iterations
	// can ping-pong between the two buffers
	auto jacobi_kernel = cuda::kernel::get(device, JacobiMethod);
	struct { cuda::graph::node::parameters_t<kind_t::kernel_launch> odd, even; } kernel_params = {
		{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x, x_new, d_sum) },
		{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x_new, x, d_sum) },
	};
	// NOTE(review): the first launch uses kernel_params.even, which reads
	// x_new and writes x - while the non-graph variant's first (k == 0)
	// iteration reads x and writes x_new. Double-check the odd/even
	// assignment here against the original NVIDIA sample.
	auto jacobi_kernel_node = graph.insert.node<kind_t::kernel_launch>(kernel_params.even);

	graph.insert.edge(memset_node, jacobi_kernel_node);

	// Node 3: copy the accumulated error sum back to the host-side `sum`
	auto memcpy_node = [&] {
		cuda::memory::copy_parameters_t<3> params;
		params.set_source(d_sum);
		params.set_destination(&sum, 1);
		params.set_extent<double>(1);
		params.clear_offsets();
		params.clear_rest();
		return graph.insert.node<cuda::graph::node::kind_t::memcpy>(params);
	}();

	graph.insert.edge(jacobi_kernel_node, memcpy_node);


	cuda::graph::instance_t instance = graph.instantiate();

	// std::cout << "settings node params for the kernel node with k == " << k << " and params.marshalled_arguments.size() = "
	// << params.marshalled_arguments.size() << std::endl;

	for (int k = 0; k < num_iterations; k++) {
		instance.launch(stream);
		// Synchronize so that `sum` (written by the memcpy node) is valid below
		stream.synchronize();

		if (sum <= convergence_threshold) {
			// Converged: compute and report the final error. The choice of
			// buffer here mirrors the parity logic above - see the NOTE(review)
			// by the kernel-node creation.
			auto x_to_overwrite = ((k & 1) == 0) ? x : x_new;
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
		// Not converged: swap the x/x_new roles of the kernel node in the
		// already-instantiated graph, for the next launch
		const auto& next_iteration_params = ((k & 1) == 0) ? kernel_params.even : kernel_params.odd;
		instance.set_node_parameters<kind_t::kernel_launch>(jacobi_kernel_node, next_iteration_params);
	}
	return sum;
}

// Jacobi solver variant: on every iteration, record the work (memzero,
// kernel launch, copy-back) into a CUDA graph via stream capture; the
// captured graph is instantiated once, on the first iteration, and on
// subsequent iterations the existing executable instance is updated
// in-place rather than re-instantiated.
//
// Returns the final error sum (0.0 if no iteration runs).
template<>
double do_jacobi_inner<computation_method_t::graph_with_exec_update>(
	const cuda::device_t &,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	// Created lazily on the first iteration; updated, not re-created, afterwards
	::std::unique_ptr<cuda::graph::instance_t> instance_ptr{};

	double sum = 0.0;
	for (int k = 0; k < num_iterations; k++) {
		// Record this iteration's work into a graph instead of executing it
		stream.begin_capture(cuda::stream::capture::mode_t::global);
		stream.enqueue.memzero(d_sum);
		// Ping-pong the roles of x and x_new between iterations
		auto x_to_read = ((k & 1) == 0) ? x : x_new;
		auto x_to_overwrite = ((k & 1) == 0) ? x_new : x;
		stream.enqueue.kernel_launch(JacobiMethod, launch_config,
			A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data());
		stream.enqueue.copy(&sum, d_sum);
		auto graph = stream.end_capture();

		if (instance_ptr == nullptr) {
			// First iteration: instantiate the captured graph
			auto instance = graph.instantiate();
			instance_ptr.reset(new cuda::graph::instance_t{::std::move(instance)});
		}
		else {
			// Later iterations: update the executable instance in-place
			instance_ptr->update(graph);
			// Note: The original code tried to re-instantiate if the update
			// of the instance failed, we don't do this.
		}
		stream.enqueue.graph_launch(*instance_ptr);
		// Synchronize so `sum` is valid on the host before the convergence check
		stream.synchronize();

		if (sum <= convergence_threshold) {
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
	}

	return sum;
}

// Jacobi solver variant: plain stream-based execution, no CUDA graphs.
// Each iteration directly enqueues a memzero of the accumulator, the
// Jacobi kernel, and a copy of the error sum back to the host, then
// synchronizes to check for convergence.
//
// Returns the final error sum; 0.0 when num_iterations < 1 (no iteration
// runs in that case).
template<>
double do_jacobi_inner<computation_method_t::non_graph_gpu>(
	const cuda::device_t &,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	// Initialized, so we don't return an indeterminate value when the loop
	// body never runs (num_iterations < 1) - consistent with the
	// graph_with_exec_update variant
	double sum = 0.0;
	for (int k = 0; k < num_iterations; k++) {
		stream.enqueue.memzero(d_sum);
		// Ping-pong the roles of x and x_new between iterations
		auto x_to_read = ((k & 1) == 0) ? x : x_new;
		auto x_to_overwrite = ((k & 1) == 0) ? x_new : x;
		stream.enqueue.kernel_launch(JacobiMethod, launch_config,
			A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data());
		stream.enqueue.copy(&sum, d_sum);
		// Synchronize so `sum` is valid on the host before the convergence check
		stream.synchronize();

		if (sum <= convergence_threshold) {
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
	}

	return sum;
}

77 changes: 77 additions & 0 deletions examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2023, Eyal Rozenberg <[email protected]>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef JACOBI_H
#define JACOBI_H

#define N_ROWS 512

#include <cuda/api.hpp>

// Use the standard library's span when compiling as C++20 or later;
// otherwise fall back to the library-provided cuda::span.
// Note: std::span is a class template, so the C++20 branch must declare an
// alias template (a plain `using span = std::span;` is ill-formed), and the
// correct C++20 feature-test value is 202002L.
#if __cplusplus >= 202002L
#include <span>
template <typename T>
using span = ::std::span<T>;
#else
using cuda::span;
#endif

// The alternative ways in which a single Jacobi-solver run can be executed
enum computation_method_t {
	graph_with_set_kernel_params = 0,
	graph_with_exec_update = 1,
	non_graph_gpu = 2,
	cpu = 3
};

// Human-readable name of a computation method, for use in reporting.
// Returns "unknown" for out-of-range values rather than indexing past the
// end of the name table.
inline const char* method_name(computation_method_t method)
{
	static const char* method_names[] = {
		"graph_with_set_kernel_params",
		"graph_with_exec_update",
		"non_graph_gpu",
		"cpu"
	};
	if (method < graph_with_set_kernel_params || method > cpu) { return "unknown"; }
	return method_names[method];
}

// Reports the error sum for a solver run; `where` labels the execution
// locus (e.g. "CPU" or "GPU"). Defined elsewhere.
void report_error_sum(const char* where, int num_iterations, double sum_on_cpu);

// Runs the Jacobi method for (at most) num_iterations iterations over the
// system given by A and b, ping-ponging between the x and x_new buffers,
// and returns the final error sum. One explicit specialization exists per
// computation_method_t value.
template <computation_method_t Method>
double do_jacobi_inner(
	const cuda::device_t& device,
	const cuda::stream_t& stream,
	span<float const> A,
	span<double const> b,
	float conv_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum);


#endif // JACOBI_H
Loading

0 comments on commit db63160

Please sign in to comment.