-
-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
… graph node builder class. Also added two modified CUDA sample programs using graph support: * graphMemoryNodes * jacobiCudaGraphs
- Loading branch information
Showing
24 changed files
with
4,449 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
186 changes: 186 additions & 0 deletions
186
examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
#include "jacobi_kernels.cuh" | ||
#include "jacobi.h" | ||
|
||
#include <vector> | ||
#include <iomanip> | ||
#include <iostream> | ||
#include <cuda/api.hpp> | ||
|
||
static void finalize_error( | ||
const cuda::stream_t& stream, span<double> d_sum, const cuda::launch_configuration_t& launch_config, | ||
double& sum, int k, const span<double> x_to_overwrite) | ||
{ | ||
stream.enqueue.memzero(d_sum); | ||
auto final_error_launch_config = launch_config; | ||
final_error_launch_config.dimensions.grid.x = (N_ROWS / final_error_launch_config.dimensions.block.x) + 1; | ||
auto warps_per_block = final_error_launch_config.dimensions.block.x / cuda::warp_size; | ||
final_error_launch_config.dynamic_shared_memory_size = (warps_per_block + 1) * sizeof(double); | ||
// TODO: Double-check the original source to ensure we're using the right x here | ||
stream.enqueue.kernel_launch(finalError, final_error_launch_config, x_to_overwrite.data(), d_sum.data()); | ||
stream.enqueue.copy(&sum, d_sum); | ||
stream.synchronize(); | ||
report_error_sum("GPU", k + 1, sum); | ||
} | ||
|
||
template<> | ||
double do_jacobi_inner<computation_method_t::graph_with_set_kernel_params>( | ||
const cuda::device_t &device, | ||
const cuda::stream_t &stream, | ||
span<float const> A, | ||
span<double const> b, | ||
float convergence_threshold, | ||
int num_iterations, | ||
span<double> x, | ||
span<double> x_new, | ||
span<double> d_sum) | ||
{ | ||
auto launch_config = cuda::launch_config_builder() | ||
.block_size(256) | ||
.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) | ||
.build(); | ||
|
||
double sum; | ||
|
||
auto graph = cuda::graph::create(); | ||
|
||
using cuda::graph::node::kind_t; | ||
|
||
auto memset_node = [&] { | ||
cuda::graph::node::parameters_t<kind_t::memory_set> params; | ||
params.value = 0; | ||
params.width_in_bytes = 4; | ||
params.region = d_sum; | ||
return graph.insert.node<kind_t::memory_set>(params); | ||
}(); | ||
|
||
auto jacobi_kernel = cuda::kernel::get(device, JacobiMethod); | ||
struct { cuda::graph::node::parameters_t<kind_t::kernel_launch> odd, even; } kernel_params = { | ||
{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x, x_new, d_sum) }, | ||
{ jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x_new, x, d_sum) }, | ||
}; | ||
auto jacobi_kernel_node = graph.insert.node<kind_t::kernel_launch>(kernel_params.even); | ||
|
||
graph.insert.edge(memset_node, jacobi_kernel_node); | ||
|
||
auto memcpy_node = [&] { | ||
cuda::memory::copy_parameters_t<3> params; | ||
params.set_source(d_sum); | ||
params.set_destination(&sum, 1); | ||
params.set_extent<double>(1); | ||
params.clear_offsets(); | ||
params.clear_rest(); | ||
return graph.insert.node<cuda::graph::node::kind_t::memcpy>(params); | ||
}(); | ||
|
||
graph.insert.edge(jacobi_kernel_node, memcpy_node); | ||
|
||
|
||
cuda::graph::instance_t instance = graph.instantiate(); | ||
|
||
// std::cout << "settings node params for the kernel node with k == " << k << " and params.marshalled_arguments.size() = " | ||
// << params.marshalled_arguments.size() << std::endl; | ||
|
||
for (int k = 0; k < num_iterations; k++) { | ||
instance.launch(stream); | ||
stream.synchronize(); | ||
|
||
if (sum <= convergence_threshold) { | ||
auto x_to_overwrite = ((k & 1) == 0) ? x : x_new; | ||
finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); | ||
break; | ||
} | ||
// Odd iterations have an even value of k, since we start with k == 0; | ||
// but - here we sent | ||
const auto& next_iteration_params = ((k & 1) == 0) ? kernel_params.even : kernel_params.odd; | ||
instance.set_node_parameters<kind_t::kernel_launch>(jacobi_kernel_node, next_iteration_params); | ||
} | ||
return sum; | ||
} | ||
|
||
template<> | ||
double do_jacobi_inner<computation_method_t::graph_with_exec_update>( | ||
const cuda::device_t &, | ||
const cuda::stream_t &stream, | ||
span<float const> A, | ||
span<double const> b, | ||
float convergence_threshold, | ||
int num_iterations, | ||
span<double> x, | ||
span<double> x_new, | ||
span<double> d_sum) | ||
{ | ||
auto launch_config = cuda::launch_config_builder() | ||
.block_size(256) | ||
.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) | ||
.build(); | ||
|
||
::std::unique_ptr<cuda::graph::instance_t> instance_ptr{}; | ||
|
||
double sum = 0.0; | ||
for (int k = 0; k < num_iterations; k++) { | ||
stream.begin_capture(cuda::stream::capture::mode_t::global); | ||
stream.enqueue.memzero(d_sum); | ||
auto x_to_read = ((k & 1) == 0) ? x : x_new; | ||
auto x_to_overwrite = ((k & 1) == 0) ? x_new : x; | ||
stream.enqueue.kernel_launch(JacobiMethod, launch_config, | ||
A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data()); | ||
stream.enqueue.copy(&sum, d_sum); | ||
auto graph = stream.end_capture(); | ||
|
||
if (instance_ptr == nullptr) { | ||
auto instance = graph.instantiate(); | ||
instance_ptr.reset(new cuda::graph::instance_t{::std::move(instance)}); | ||
} | ||
else { | ||
instance_ptr->update(graph); | ||
// Note: The original code tried to re-instantiate if the update | ||
// of the instance failed, we don't do this. | ||
} | ||
stream.enqueue.graph_launch(*instance_ptr); | ||
stream.synchronize(); | ||
|
||
if (sum <= convergence_threshold) { | ||
finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); | ||
break; | ||
} | ||
} | ||
|
||
return sum; | ||
} | ||
|
||
template<> | ||
double do_jacobi_inner<computation_method_t::non_graph_gpu>( | ||
const cuda::device_t &, | ||
const cuda::stream_t &stream, | ||
span<float const> A, | ||
span<double const> b, | ||
float convergence_threshold, | ||
int num_iterations, | ||
span<double> x, | ||
span<double> x_new, | ||
span<double> d_sum) | ||
{ | ||
auto launch_config = cuda::launch_config_builder() | ||
.block_size(256) | ||
.grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) | ||
.build(); | ||
|
||
double sum; | ||
for (int k = 0; k < num_iterations; k++) { | ||
stream.enqueue.memzero(d_sum); | ||
auto x_to_read = ((k & 1) == 0) ? x : x_new; | ||
auto x_to_overwrite = ((k & 1) == 0) ? x_new : x; | ||
stream.enqueue.kernel_launch(JacobiMethod, launch_config, | ||
A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data()); | ||
stream.enqueue.copy(&sum, d_sum); | ||
stream.synchronize(); | ||
|
||
if (sum <= convergence_threshold) { | ||
finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); | ||
break; | ||
} | ||
} | ||
|
||
return sum; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | ||
* Modifications Copyright (c) 2023, Eyal Rozenberg <[email protected]> | ||
* | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions | ||
* are met: | ||
* * Redistributions of source code must retain the above copyright | ||
* notice, this list of conditions and the following disclaimer. | ||
* * Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in the | ||
* documentation and/or other materials provided with the distribution. | ||
* * Neither the name of NVIDIA CORPORATION nor the names of its | ||
* contributors may be used to endorse or promote products derived | ||
* from this software without specific prior written permission. | ||
* | ||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
*/ | ||
|
||
#ifndef JACOBI_H | ||
#define JACOBI_H | ||
|
||
#define N_ROWS 512 | ||
|
||
#include <cuda/api.hpp> | ||
|
||
// Select the span class template used, unqualified, throughout this sample:
// the standard library's when compiling as C++20 or later, and the one offered
// by the CUDA API wrappers otherwise.
// std::span is a class template, so it cannot be the target of a plain alias
// declaration ("using span = std::span;" is ill-formed); a using-declaration
// brings the template itself into scope.
#if __cplusplus >= 202002L
#include <span>
using std::span;
#else
using cuda::span;
#endif
|
||
#define N_ROWS 512 | ||
|
||
/**
 * The different ways of carrying out the Jacobi computation.
 *
 * The numeric values are also used as indices into the table of names in
 * method_name(), so the two must be kept in sync.
 */
enum computation_method_t {
    graph_with_set_kernel_params = 0,
    graph_with_exec_update = 1,
    non_graph_gpu = 2,
    cpu = 3
};

/**
 * Obtains a human-readable name for a computation method.
 *
 * @param method the method whose name is wanted
 * @return a pointer to a statically-allocated string with the enumerator's
 *         name, or "unknown" if @p method does not correspond to any
 *         enumerator (e.g. when it was produced by converting an arbitrary
 *         integer) - rather than reading outside the table of names
 */
inline const char* method_name(computation_method_t method)
{
    static const char* method_names[] = {
        "graph_with_set_kernel_params",
        "graph_with_exec_update",
        "non_graph_gpu",
        "cpu"
    };
    const auto num_names = sizeof(method_names) / sizeof(method_names[0]);
    // Going through unsigned makes negative values fail the range check as well
    const auto index = static_cast<unsigned>(method);
    if (index >= num_names) {
        return "unknown";
    }
    return method_names[index];
}
|
||
/**
 * Reports an error sum. Only declared in this header; the definition is
 * not visible here.
 *
 * @param where           label for the origin of the sum (the GPU code paths
 *                        pass "GPU")
 * @param num_iterations  number of iterations to report
 * @param sum_on_cpu      the error sum to report
 */
void report_error_sum(
    const char* where,
    int         num_iterations,
    double      sum_on_cpu);
|
||
template <computation_method_t Method> | ||
double do_jacobi_inner( | ||
const cuda:: device_t& device, | ||
const cuda::stream_t &stream, | ||
span<float const> A, | ||
span<double const> b, | ||
float conv_threshold, | ||
int num_iterations, | ||
span<double> x, | ||
span<double> x_new, | ||
span<double> d_sum); | ||
|
||
|
||
#endif // JACOBI_H |
Oops, something went wrong.