Skip to content

Commit

Permalink
Fixes #175, #458: Added graph support, including stream capture and a…
Browse files Browse the repository at this point in the history
… graph node builder class.

Also added two modified CUDA sample programs using graph support:

* graphMemoryNodes
* jacobiCudaGraphs
  • Loading branch information
eyalroz committed May 4, 2024
1 parent aad918a commit db63160
Show file tree
Hide file tree
Showing 24 changed files with 4,452 additions and 23 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ write_basic_package_version_file(
COMPATIBILITY ${COMPAT_SETTING}
)


install(
FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-api-wrappers-config-version.cmake"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-api-wrappers"
Expand Down
12 changes: 12 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,19 @@ if(USE_COOPERATIVE_GROUPS AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
add_executable(binaryPartitionCG modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu)
endif()
add_executable(bandwidthtest modified_cuda_samples/bandwidthtest/bandwidthtest.cpp)
# CUDA graphs were introduced in CUDA 10.0, so the graph-based example
# programs can only be built against that toolkit version or later
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.0")
	add_executable(simpleCudaGraphs modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu)
	add_executable(jacobiCudaGraphs
		modified_cuda_samples/jacobiCudaGraphs/main.cpp
		modified_cuda_samples/jacobiCudaGraphs/jacobi.cu
	)
endif()
#----
# Umbrella target for building all of the modified-CUDA-sample examples at once
add_custom_target(modified_cuda_samples)
add_dependencies(modified_cuda_samples
	vectorAdd inlinePTX simpleStreams simpleIPC
	# simpleCudaGraphs graphMemoryNodes jacobiCudaGraphs
	# NOTE(review): the graph-based samples above are commented out of the
	# umbrella target - presumably because they are only built conditionally
	# (CUDA >= 10.0); confirm this exclusion is intentional.
)

add_executable(version_management by_api_module/version_management.cpp)
add_executable(error_handling by_api_module/error_handling.cu)
Expand Down
12 changes: 12 additions & 0 deletions examples/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ bool your_type_was_() { return true; }
#define print_type_of(_x) your_type_was_<decltype(_x)>()
#endif

// Returns the English ordinal suffix for an integer, e.g. 1 -> "st",
// 2 -> "nd", 11 -> "th", 23 -> "rd". Handles the 11/12/13 special cases
// (which take "th" despite their last digit), and is safe for negative
// and zero inputs as well (the suffix of the absolute value is used).
inline const char* ordinal_suffix(int n)
{
	static const char suffixes [4][5] = {"th", "st", "nd", "rd"};
	auto ord = n % 100;
	// In C++, % may yield a negative remainder for negative n; a negative
	// value would index out of bounds below, so take the absolute value
	if (ord < 0) { ord = -ord; }
	// 11th, 12th, 13th - not 11st, 12nd, 13rd
	if (ord / 10 == 1) { ord = 0; }
	ord = ord % 10;
	return suffixes[ord > 3 ? 0 : ord];
}

// Renders a number as an English ordinal string, e.g. 1 -> "1st",
// 22 -> "22nd", 30 -> "30th".
template <typename N = int>
inline ::std::string xth(N n)
{
	::std::string result = ::std::to_string(n);
	result += ordinal_suffix(n);
	return result;
}

const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref)
{
static const char* cache_preference_names[] = {
Expand Down
186 changes: 186 additions & 0 deletions examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#include "jacobi_kernels.cuh"
#include "jacobi.h"

#include <vector>
#include <iomanip>
#include <iostream>
#include <cuda/api.hpp>

// Computes and reports the final error sum once the Jacobi iterations are
// done: zeroes the device-side accumulator d_sum, launches the finalError
// kernel over x_to_overwrite, copies the result back into `sum` on the host,
// and prints it via report_error_sum (labeled "GPU", with k+1 iterations).
//
// Note: this synchronizes `stream` before returning, so `sum` is valid on
// the host when this function completes.
static void finalize_error(
	const cuda::stream_t& stream, span<double> d_sum, const cuda::launch_configuration_t& launch_config,
	double& sum, int k, const span<double> x_to_overwrite)
{
	stream.enqueue.memzero(d_sum);
	// Derive a launch configuration for the reduction kernel from the one
	// used for the Jacobi kernel: same block size, but grid size rounded up
	// to cover N_ROWS, plus dynamic shared memory of one double per warp
	// (and one extra)
	auto final_error_launch_config = launch_config;
	final_error_launch_config.dimensions.grid.x = (N_ROWS / final_error_launch_config.dimensions.block.x) + 1;
	auto warps_per_block = final_error_launch_config.dimensions.block.x / cuda::warp_size;
	final_error_launch_config.dynamic_shared_memory_size = (warps_per_block + 1) * sizeof(double);
	// TODO: Double-check the original source to ensure we're using the right x here
	stream.enqueue.kernel_launch(finalError, final_error_launch_config, x_to_overwrite.data(), d_sum.data());
	stream.enqueue.copy(&sum, d_sum);
	stream.synchronize();
	report_error_sum("GPU", k + 1, sum);
}

// Jacobi solver variant: builds a CUDA graph explicitly, with three nodes
// (memset -> kernel -> memcpy), instantiates it once, and re-launches the
// same executable instance every iteration - swapping the kernel node's
// x/x_new arguments between iterations via set_node_parameters() rather
// than rebuilding or re-instantiating the graph.
//
// Returns the final error sum (written into `sum` on the host by the
// graph's memcpy node on each launch).
template<>
double do_jacobi_inner<computation_method_t::graph_with_set_kernel_params>(
	const cuda::device_t &device,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	// Host-side destination of the per-iteration error sum; the graph's
	// memcpy node writes into this on every launch
	double sum;

	auto graph = cuda::graph::create();

	using cuda::graph::node::kind_t;

	// Node 1: zero the device-side accumulator before each kernel run
	auto memset_node = [&] {
		cuda::graph::node::parameters_t<kind_t::memory_set> params;
		params.value = 0;
		params.width_in_bytes = 4;
		params.region = d_sum;
		return graph.insert.node<kind_t::memory_set>(params);
	}();

	// Two alternative argument sets for the kernel node - differing only in
	// which of x / x_new is read and which is overwritten - so iterations
	// can ping-pong between the two buffers
	auto jacobi_kernel = cuda::kernel::get(device, JacobiMethod);
	struct { cuda::graph::node::parameters_t<kind_t::kernel_launch> odd, even; } kernel_params = {
		{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x, x_new, d_sum) },
		{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x_new, x, d_sum) },
	};
	// NOTE(review): the first launch uses kernel_params.even, which reads
	// x_new and writes x - while the non-graph variant's first (k == 0)
	// iteration reads x and writes x_new. Double-check the odd/even
	// assignment here against the original NVIDIA sample.
	auto jacobi_kernel_node = graph.insert.node<kind_t::kernel_launch>(kernel_params.even);

	graph.insert.edge(memset_node, jacobi_kernel_node);

	// Node 3: copy the accumulated error sum back to the host-side `sum`
	auto memcpy_node = [&] {
		cuda::memory::copy_parameters_t<3> params;
		params.set_source(d_sum);
		params.set_destination(&sum, 1);
		params.set_extent<double>(1);
		params.clear_offsets();
		params.clear_rest();
		return graph.insert.node<cuda::graph::node::kind_t::memcpy>(params);
	}();

	graph.insert.edge(jacobi_kernel_node, memcpy_node);


	cuda::graph::instance_t instance = graph.instantiate();

	// std::cout << "settings node params for the kernel node with k == " << k << " and params.marshalled_arguments.size() = "
	// << params.marshalled_arguments.size() << std::endl;

	for (int k = 0; k < num_iterations; k++) {
		instance.launch(stream);
		// Synchronize so that `sum` (written by the memcpy node) is valid below
		stream.synchronize();

		if (sum <= convergence_threshold) {
			// Converged: compute and report the final error. The choice of
			// buffer here mirrors the parity logic above - see the NOTE(review)
			// by the kernel-node creation.
			auto x_to_overwrite = ((k & 1) == 0) ? x : x_new;
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
		// Not converged: swap the x/x_new roles of the kernel node in the
		// already-instantiated graph, for the next launch
		const auto& next_iteration_params = ((k & 1) == 0) ? kernel_params.even : kernel_params.odd;
		instance.set_node_parameters<kind_t::kernel_launch>(jacobi_kernel_node, next_iteration_params);
	}
	return sum;
}

// Jacobi solver variant: on every iteration, record the work (memzero,
// kernel launch, copy-back) into a CUDA graph via stream capture; the
// captured graph is instantiated once, on the first iteration, and on
// subsequent iterations the existing executable instance is updated
// in-place rather than re-instantiated.
//
// Returns the final error sum (0.0 if no iteration runs).
template<>
double do_jacobi_inner<computation_method_t::graph_with_exec_update>(
	const cuda::device_t &,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	// Created lazily on the first iteration; updated, not re-created, afterwards
	::std::unique_ptr<cuda::graph::instance_t> instance_ptr{};

	double sum = 0.0;
	for (int k = 0; k < num_iterations; k++) {
		// Record this iteration's work into a graph instead of executing it
		stream.begin_capture(cuda::stream::capture::mode_t::global);
		stream.enqueue.memzero(d_sum);
		// Ping-pong the roles of x and x_new between iterations
		auto x_to_read = ((k & 1) == 0) ? x : x_new;
		auto x_to_overwrite = ((k & 1) == 0) ? x_new : x;
		stream.enqueue.kernel_launch(JacobiMethod, launch_config,
			A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data());
		stream.enqueue.copy(&sum, d_sum);
		auto graph = stream.end_capture();

		if (instance_ptr == nullptr) {
			// First iteration: instantiate the captured graph
			auto instance = graph.instantiate();
			instance_ptr.reset(new cuda::graph::instance_t{::std::move(instance)});
		}
		else {
			// Later iterations: update the executable instance in-place
			instance_ptr->update(graph);
			// Note: The original code tried to re-instantiate if the update
			// of the instance failed, we don't do this.
		}
		stream.enqueue.graph_launch(*instance_ptr);
		// Synchronize so `sum` is valid on the host before the convergence check
		stream.synchronize();

		if (sum <= convergence_threshold) {
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
	}

	return sum;
}

// Jacobi solver variant: plain stream-based execution, no CUDA graphs.
// Each iteration directly enqueues a memzero of the accumulator, the
// Jacobi kernel, and a copy of the error sum back to the host, then
// synchronizes to check for convergence.
//
// Returns the final error sum; 0.0 when num_iterations < 1 (no iteration
// runs in that case).
template<>
double do_jacobi_inner<computation_method_t::non_graph_gpu>(
	const cuda::device_t &,
	const cuda::stream_t &stream,
	span<float const> A,
	span<double const> b,
	float convergence_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum)
{
	auto launch_config = cuda::launch_config_builder()
		.block_size(256)
		.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1)
		.build();

	// Initialized, so we don't return an indeterminate value when the loop
	// body never runs (num_iterations < 1) - consistent with the
	// graph_with_exec_update variant
	double sum = 0.0;
	for (int k = 0; k < num_iterations; k++) {
		stream.enqueue.memzero(d_sum);
		// Ping-pong the roles of x and x_new between iterations
		auto x_to_read = ((k & 1) == 0) ? x : x_new;
		auto x_to_overwrite = ((k & 1) == 0) ? x_new : x;
		stream.enqueue.kernel_launch(JacobiMethod, launch_config,
			A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data());
		stream.enqueue.copy(&sum, d_sum);
		// Synchronize so `sum` is valid on the host before the convergence check
		stream.synchronize();

		if (sum <= convergence_threshold) {
			finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite);
			break;
		}
	}

	return sum;
}

77 changes: 77 additions & 0 deletions examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2023, Eyal Rozenberg <[email protected]>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef JACOBI_H
#define JACOBI_H

#define N_ROWS 512

#include <cuda/api.hpp>

// Use the standard library's span when compiling as C++20 or later;
// otherwise fall back to the library-provided cuda::span.
// Note: std::span is a class template, so the C++20 branch must declare an
// alias template (a plain `using span = std::span;` is ill-formed), and the
// correct C++20 feature-test value is 202002L.
#if __cplusplus >= 202002L
#include <span>
template <typename T>
using span = ::std::span<T>;
#else
using cuda::span;
#endif

// The alternative ways in which a single Jacobi-solver run can be executed
enum computation_method_t {
	graph_with_set_kernel_params = 0,
	graph_with_exec_update = 1,
	non_graph_gpu = 2,
	cpu = 3
};

// Human-readable name of a computation method, for use in reporting.
// Returns "unknown" for out-of-range values rather than indexing past the
// end of the name table.
inline const char* method_name(computation_method_t method)
{
	static const char* method_names[] = {
		"graph_with_set_kernel_params",
		"graph_with_exec_update",
		"non_graph_gpu",
		"cpu"
	};
	if (method < graph_with_set_kernel_params || method > cpu) { return "unknown"; }
	return method_names[method];
}

// Reports the error sum for a solver run; `where` labels the execution
// locus (e.g. "CPU" or "GPU"). Defined elsewhere.
void report_error_sum(const char* where, int num_iterations, double sum_on_cpu);

// Runs the Jacobi method for (at most) num_iterations iterations over the
// system given by A and b, ping-ponging between the x and x_new buffers,
// and returns the final error sum. One explicit specialization exists per
// computation_method_t value.
template <computation_method_t Method>
double do_jacobi_inner(
	const cuda::device_t& device,
	const cuda::stream_t& stream,
	span<float const> A,
	span<double const> b,
	float conv_threshold,
	int num_iterations,
	span<double> x,
	span<double> x_new,
	span<double> d_sum);


#endif // JACOBI_H
Loading

0 comments on commit db63160

Please sign in to comment.