From 24f622b376352a82963511f85ae164b10d8230d2 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sat, 14 Jan 2023 20:21:53 +0200 Subject: [PATCH] Fixes #175, #458: Added graph support, including stream capture and a graph node builder class. Also added two modified CUDA sample programs using graph support: * graphMemoryNodes * jacobiCudaGraphs --- CMakeLists.txt | 1 + examples/CMakeLists.txt | 7 + examples/common.hpp | 12 + .../jacobiCudaGraphs/jacobi.cu | 186 ++++ .../jacobiCudaGraphs/jacobi.h | 77 ++ .../jacobiCudaGraphs/jacobi_kernels.cuh | 172 ++++ .../jacobiCudaGraphs/main.cpp | 182 ++++ .../simpleCudaGraphs/simpleCudaGraphs.cu | 508 +++++++++++ examples/other/jitify/string_view.hpp | 34 +- src/cuda/api.hpp | 13 +- src/cuda/api/context.hpp | 42 + src/cuda/api/detail/for_each_argument.hpp | 26 + src/cuda/api/device.hpp | 68 ++ src/cuda/api/external.hpp | 4 + src/cuda/api/graph/identify.hpp | 100 +++ src/cuda/api/graph/instance.hpp | 504 +++++++++++ src/cuda/api/graph/node.hpp | 152 ++++ src/cuda/api/graph/node_builder.hpp | 606 +++++++++++++ src/cuda/api/graph/template.hpp | 829 ++++++++++++++++++ src/cuda/api/graph/typed_node.hpp | 558 ++++++++++++ src/cuda/api/memory_pool.hpp | 4 + src/cuda/api/multi_wrapper_impls/graph.hpp | 230 +++++ src/cuda/api/stream.hpp | 79 +- src/cuda/api/types.hpp | 78 +- 24 files changed, 4449 insertions(+), 23 deletions(-) create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh create mode 100644 examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp create mode 100644 examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu create mode 100644 src/cuda/api/detail/for_each_argument.hpp create mode 100644 src/cuda/api/graph/identify.hpp create mode 100644 src/cuda/api/graph/instance.hpp create mode 100644 src/cuda/api/graph/node.hpp create mode 100644 src/cuda/api/graph/node_builder.hpp create mode 100644 src/cuda/api/graph/template.hpp create mode 100644 src/cuda/api/graph/typed_node.hpp create mode 100644 src/cuda/api/multi_wrapper_impls/graph.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2fdfece9..69ceda5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ write_basic_package_version_file( COMPATIBILITY ${COMPAT_SETTING} ) + install( FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda-api-wrappers-config-version.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cuda-api-wrappers" diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 5adaaadc..078b1591 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -144,6 +144,13 @@ if(USE_COOPERATIVE_GROUPS AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0") add_executable(binaryPartitionCG modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu) endif() add_executable(bandwidthtest modified_cuda_samples/bandwidthtest/bandwidthtest.cpp) +if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.0") + add_executable(simpleCudaGraphs modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu) + add_executable(jacobiCudaGraphs + modified_cuda_samples/jacobiCudaGraphs/main.cpp + modified_cuda_samples/jacobiCudaGraphs/jacobi.cu + ) +endif() #---- add_executable(version_management by_api_module/version_management.cpp) diff --git a/examples/common.hpp b/examples/common.hpp index 4a7467ba..62e5a3a4 100644 --- a/examples/common.hpp +++ b/examples/common.hpp @@ -35,6 +35,18 @@ bool 
your_type_was_() { return true; } #define print_type_of(_x) your_type_was_() #endif +inline const char* ordinal_suffix(int n) +{ + static const char suffixes [4][5] = {"th", "st", "nd", "rd"}; + auto ord = n % 100; + if (ord / 10 == 1) { ord = 0; } + ord = ord % 10; + return suffixes[ord > 3 ? 0 : ord]; +} + +template +inline ::std::string xth(N n) { return ::std::to_string(n) + ordinal_suffix(n); } + const char* cache_preference_name(cuda::multiprocessor_cache_preference_t pref) { static const char* cache_preference_names[] = { diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu new file mode 100644 index 00000000..a583d5ec --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.cu @@ -0,0 +1,186 @@ +#include "jacobi_kernels.cuh" +#include "jacobi.h" + +#include +#include +#include +#include + +static void finalize_error( + const cuda::stream_t& stream, span d_sum, const cuda::launch_configuration_t& launch_config, + double& sum, int k, const span x_to_overwrite) +{ + stream.enqueue.memzero(d_sum); + auto final_error_launch_config = launch_config; + final_error_launch_config.dimensions.grid.x = (N_ROWS / final_error_launch_config.dimensions.block.x) + 1; + auto warps_per_block = final_error_launch_config.dimensions.block.x / cuda::warp_size; + final_error_launch_config.dynamic_shared_memory_size = (warps_per_block + 1) * sizeof(double); + // TODO: Double-check the original source to ensure we're using the right x here + stream.enqueue.kernel_launch(finalError, final_error_launch_config, x_to_overwrite.data(), d_sum.data()); + stream.enqueue.copy(&sum, d_sum); + stream.synchronize(); + report_error_sum("GPU", k + 1, sum); +} + +template<> +double do_jacobi_inner( + const cuda::device_t &device, + const cuda::stream_t &stream, + span A, + span b, + float convergence_threshold, + int num_iterations, + span x, + span x_new, + span d_sum) +{ + auto launch_config = cuda::launch_config_builder() + .block_size(256) + .grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) + .build(); + + double sum; + + auto graph = cuda::graph::create(); + + using cuda::graph::node::kind_t; + + auto memset_node = [&] { + cuda::graph::node::parameters_t params; + params.value = 0; + params.width_in_bytes = 4; + params.region = d_sum; + return graph.insert.node(params); + }(); + + auto jacobi_kernel = cuda::kernel::get(device, JacobiMethod); + struct { cuda::graph::node::parameters_t odd, even; } kernel_params = { + { jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x, x_new, d_sum) }, + { jacobi_kernel, launch_config, cuda::graph::make_kernel_argument_pointers(A, b, convergence_threshold, x_new, x, d_sum) }, + }; + auto jacobi_kernel_node = graph.insert.node(kernel_params.even); + + graph.insert.edge(memset_node, jacobi_kernel_node); + + auto memcpy_node = [&] { + cuda::memory::copy_parameters_t<3> params; + params.set_source(d_sum); + params.set_destination(&sum, 1); + params.set_extent(1); + params.clear_offsets(); + params.clear_rest(); + return graph.insert.node(params); + }(); + + graph.insert.edge(jacobi_kernel_node, memcpy_node); + + + cuda::graph::instance_t instance = graph.instantiate(); + +// std::cout << "settings node params for the kernel node with k == " << k << " and params.marshalled_arguments.size() = " +// << params.marshalled_arguments.size() << std::endl; + + for (int k = 0; k < num_iterations; k++) { + instance.launch(stream); + 
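+ // Note: the graph's final memcpy node writes d_sum back into the host-side `sum`;
+ // the synchronization below makes sure that value has arrived before it is tested.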
stream.synchronize(); + + if (sum <= convergence_threshold) { + auto x_to_overwrite = ((k & 1) == 0) ? x : x_new; + finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); + break; + } + // Odd iterations have an even value of k, since we start with k == 0; + // but - here we sent + const auto& next_iteration_params = ((k & 1) == 0) ? kernel_params.even : kernel_params.odd; + instance.set_node_parameters(jacobi_kernel_node, next_iteration_params); + } + return sum; +} + +template<> +double do_jacobi_inner( + const cuda::device_t &, + const cuda::stream_t &stream, + span A, + span b, + float convergence_threshold, + int num_iterations, + span x, + span x_new, + span d_sum) +{ + auto launch_config = cuda::launch_config_builder() + .block_size(256) + .grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) + .build(); + + ::std::unique_ptr instance_ptr{}; + + double sum = 0.0; + for (int k = 0; k < num_iterations; k++) { + stream.begin_capture(cuda::stream::capture::mode_t::global); + stream.enqueue.memzero(d_sum); + auto x_to_read = ((k & 1) == 0) ? x : x_new; + auto x_to_overwrite = ((k & 1) == 0) ? x_new : x; + stream.enqueue.kernel_launch(JacobiMethod, launch_config, + A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data()); + stream.enqueue.copy(&sum, d_sum); + auto graph = stream.end_capture(); + + if (instance_ptr == nullptr) { + auto instance = graph.instantiate(); + instance_ptr.reset(new cuda::graph::instance_t{::std::move(instance)}); + } + else { + instance_ptr->update(graph); + // Note: The original code tried to re-instantiate if the update + // of the instance failed, we don't do this. + } + stream.enqueue.graph_launch(*instance_ptr); + stream.synchronize(); + + if (sum <= convergence_threshold) { + finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); + break; + } + } + + return sum; +} + +template<> +double do_jacobi_inner( + const cuda::device_t &, + const cuda::stream_t &stream, + span A, + span b, + float convergence_threshold, + int num_iterations, + span x, + span x_new, + span d_sum) +{ + auto launch_config = cuda::launch_config_builder() + .block_size(256) + .grid_dimensions((N_ROWS / ROWS_PER_CTA) + 2, 1, 1) + .build(); + + double sum; + for (int k = 0; k < num_iterations; k++) { + stream.enqueue.memzero(d_sum); + auto x_to_read = ((k & 1) == 0) ? x : x_new; + auto x_to_overwrite = ((k & 1) == 0) ? x_new : x; + stream.enqueue.kernel_launch(JacobiMethod, launch_config, + A.data(), b.data(), convergence_threshold, x_to_read.data(), x_to_overwrite.data(), d_sum.data()); + stream.enqueue.copy(&sum, d_sum); + stream.synchronize(); + + if (sum <= convergence_threshold) { + finalize_error(stream, d_sum, launch_config, sum, k, x_to_overwrite); + break; + } + } + + return sum; +} + diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h new file mode 100644 index 00000000..9b95b1ba --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef JACOBI_H +#define JACOBI_H + +#define N_ROWS 512 + +#include + +#if __cplusplus >= 202001L +using span = std::span; +#else +using cuda::span; +#endif + +#define N_ROWS 512 + +enum computation_method_t { + graph_with_set_kernel_params = 0, + graph_with_exec_update = 1, + non_graph_gpu = 2, + cpu = 3 +}; + +inline const char* method_name(computation_method_t method) +{ + static const char* method_names[] = { + "graph_with_set_kernel_params", + "graph_with_exec_update", + "non_graph_gpu", + "cpu" + }; + return method_names[method]; +} + +void report_error_sum(const char* where, int num_iterations, double sum_on_cpu); + +template +double do_jacobi_inner( + const cuda:: device_t& device, + const cuda::stream_t &stream, + span A, + span b, + float conv_threshold, + int num_iterations, + span x, + span x_new, + span d_sum); + + +#endif // JACOBI_H \ No newline at end of file diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh new file mode 100644 index 00000000..a57a6de4 --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/jacobi_kernels.cuh @@ -0,0 +1,172 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "jacobi.h" + +#include + +namespace cg = cooperative_groups; + +// 8 Rows of square-matrix A processed by each CTA. +// This can be max 32 and only power of 2 (i.e., 2/4/8/16/32). +#define ROWS_PER_CTA 8 + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != + // NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +static __global__ void JacobiMethod( + float const *A, + double const *b, + float const convergence_threshold, + double const *x, + double *x_new, + double *sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ double x_shared[N_ROWS]; // N_ROWS == n + __shared__ double b_shared[ROWS_PER_CTA + 1]; + + for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) { + x_shared[i] = x[i]; + } + + if (threadIdx.x < ROWS_PER_CTA) { + int k = threadIdx.x; + for (int i = k + (blockIdx.x * ROWS_PER_CTA); + (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) { + b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; + } + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; + (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) { + double rowThreadSum = 0.0; + for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) { + rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); + } + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + rowThreadSum += tile32.shfl_down(rowThreadSum, offset); + } + + if (tile32.thread_rank() == 0) { + atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); + } + } + + cg::sync(cta); + + if (threadIdx.x < ROWS_PER_CTA) { + cg::thread_block_tile tile8 = + cg::tiled_partition(cta); + double temp_sum = 0.0; + + int k = threadIdx.x; + + for (int i = k + (blockIdx.x * ROWS_PER_CTA); + (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) { + double dx = b_shared[i % (ROWS_PER_CTA + 1)]; + dx /= A[i * N_ROWS + i]; + + x_new[i] = (x_shared[i] + dx); + temp_sum += fabs(dx); + } + + for (int offset = tile8.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile8.shfl_down(temp_sum, offset); + } + + if (tile8.thread_rank() == 0) { + atomicAdd(sum, temp_sum); + } + } +} + +// Thread block size for finalError kernel should be multiple of 32 +static __global__ void finalError(double const *x, double *g_sum) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ double warpSum[]; + double sum = 0.0; + + int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + + for 
(int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) { + double d = x[i] - 1.0; + sum += fabs(d); + } + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + sum += tile32.shfl_down(sum, offset); + } + + if (tile32.thread_rank() == 0) { + warpSum[threadIdx.x / warpSize] = sum; + } + + cg::sync(cta); + + double blockSum = 0.0; + if (threadIdx.x < (blockDim.x / warpSize)) { + blockSum = warpSum[threadIdx.x]; + } + + if (threadIdx.x < 32) { + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + blockSum += tile32.shfl_down(blockSum, offset); + } + if (tile32.thread_rank() == 0) { + atomicAdd(g_sum, blockSum); + } + } +} diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp b/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp new file mode 100644 index 00000000..9a8e5386 --- /dev/null +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp @@ -0,0 +1,182 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// This sample demonstrates Instantiated CUDA Graph Update +// with Jacobi Iterative Method in 3 different methods: +// 1 - JacobiMethodGpuCudaGraphExecKernelSetParams() - CUDA Graph with +// cudaGraphExecKernelNodeSetParams() 2 - JacobiMethodGpuCudaGraphExecUpdate() - +// CUDA Graph with cudaGraphExecUpdate() 3 - JacobiMethodGpu() - Non CUDA Graph +// method + +// Jacobi method on a linear system A*x = b, +// where A is diagonally dominant and the exact solution consists +// of all ones. +// The dimension N_ROWS is included in jacobi.h + +#include +#include +#include +#include +#include +#include "jacobi.h" + +[[noreturn]] bool die_(const ::std::string& message = "") +{ + if (not message.empty()) { ::std::cerr << message << std::endl; } + exit(EXIT_FAILURE); +} + +// creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1 +// elsewhere. 
The elements of the right hand side b all equal 2*n, hence the +// exact solution x to A*x = b is a vector of ones. +void createLinearSystem(span A, span b) +{ + int i, j; + for (i = 0; i < N_ROWS; i++) { + b[i] = 2.0 * N_ROWS; + for (j = 0; j < N_ROWS; j++) A[i * N_ROWS + j] = 1.0; + A[i * N_ROWS + i] = N_ROWS + 1.0; + } +} + +// Run the Jacobi method for A*x = b on CPU. +std::pair do_jacobi_on_cpu(span A, span b, float convergence_threshold, int max_iterations) +{ + auto x_ = std::array{}; + span x = { x_.data(), N_ROWS }; + auto x_new = std::array { 0 }; + int k; + + for (k = 0; k < max_iterations; k++) { + double sum = 0.0; + for (int i = 0; i < N_ROWS; i++) { + double temp_dx = b[i]; + for (int j = 0; j < N_ROWS; j++) temp_dx -= A[i * N_ROWS + j] * x[j]; + temp_dx /= A[i * N_ROWS + i]; + x_new[i] += temp_dx; + sum += ::std::fabs(temp_dx); + } + std::copy(x_new.cbegin(), x_new.cend(), x.begin()); + + if (sum <= convergence_threshold) break; + } + + double sum = std::accumulate(std::begin(x), std::end(x), 0.0, + [](double accumulation_so_far, double element) { + return accumulation_so_far + ::std::fabs(element - 1.0); + } ); +// sdkStopTimer(&timerCPU); + + report_error_sum("CPU", k+1, sum); + std::cout << '\n'; + + return {k + 1, sum}; +} + +template +bool do_gpu_jacobi( + const cuda::device_t& device, + const cuda::stream_t& stream, + span A, + span b, + span d_A, + span d_b, + float convergence_threshold, + int max_iterations, + span d_x, + span d_x_new, + span d_sum, + double sum_on_cpu +) +{ + stream.enqueue.memzero(d_x); + stream.enqueue.memzero(d_x_new); + stream.enqueue.copy(d_A, A); + stream.enqueue.copy(d_b, b); + +// sdkCreateTimer(&timerGpu); +// sdkStartTimer(&timerGpu); + + std::cout << "Jacobi computation with method " << method_name(Method) << ":\n"; + double sum = do_jacobi_inner(device, stream, d_A, d_b, convergence_threshold, max_iterations, d_x, d_x_new, d_sum); + + bool success = ::std::fabs(sum_on_cpu - sum) < convergence_threshold; + std::cout << (success ? "PASSED" : "FAILED") << "\n\n"; + return success; +} + +void report_error_sum(const char* where, int num_iterations, double sum_on_cpu) +{ + std::cout << where << " iterations : " << num_iterations << '\n'; + auto cout_flags (std::cout.flags()); + std::cout << where << " error : " << std::setprecision(3) << std::scientific << sum_on_cpu << '\n'; + std::cout.setf(cout_flags); +} + +int main(int argc, char **argv) +{ + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
::std::stoi(argv[1]) : cuda::device::default_device_id; + auto device = cuda::device::get(device_id); + + auto b = cuda::memory::host::make_unique_span(N_ROWS); + auto A = cuda::memory::host::make_unique_span(N_ROWS * N_ROWS); + + createLinearSystem(A, b); + + float convergence_threshold = 1.0e-2; + int max_num_iterations = 4 * N_ROWS * N_ROWS; + + // create timer +// StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; +// sdkCreateTimer(&timerCPU); + +// sdkStartTimer(&timerCPU); + auto num_iterations_and_sum = do_jacobi_on_cpu(A, b, convergence_threshold, max_num_iterations); + auto num_iterations = num_iterations_and_sum.first; + auto sum_on_cpu = num_iterations_and_sum.second; + +// printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); + + auto d_A = cuda::memory::device::make_unique_span(device, N_ROWS * N_ROWS); + auto d_b = cuda::memory::device::make_unique_span(device, N_ROWS); + auto d_x = cuda::memory::device::make_unique_span(device, N_ROWS); + auto d_x_new = cuda::memory::device::make_unique_span(device, N_ROWS); + auto d_sum = cuda::memory::device::make_unique_span(device, 1); + auto stream = cuda::stream::create(device, cuda::stream::async); + + do_gpu_jacobi( + device, stream, A, b, d_A, d_b, convergence_threshold, num_iterations, d_x, d_x_new, d_sum, sum_on_cpu) or die_(); + do_gpu_jacobi( + device, stream, A, b, d_A, d_b, convergence_threshold, num_iterations, d_x, d_x_new, d_sum, sum_on_cpu) or die_(); + do_gpu_jacobi( + device, stream, A, b, d_A, d_b, convergence_threshold, num_iterations, d_x, d_x_new, d_sum, sum_on_cpu) or die_(); + + std::cout << "SUCCESS\n"; +} + diff --git a/examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu b/examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu new file mode 100644 index 00000000..0715bddd --- /dev/null +++ b/examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu @@ -0,0 +1,508 @@ +/* + * Original code Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#if __cplusplus >= 202001L +using span = std::span; +#else +using cuda::span; +#endif + +namespace cg = cooperative_groups; + +#define THREADS_PER_BLOCK 512 +#define GRAPH_LAUNCH_ITERATIONS 3 + +__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, size_t outputSize) +{ + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double) inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + double beta = temp_sum; + double temp; + + for (int i = tile32.size() / 2; i > 0; i >>= 1) { + if (tile32.thread_rank() < i) { + temp = tmp[cta.thread_rank() + i]; + beta += temp; + tmp[cta.thread_rank()] = beta; + } + cg::sync(tile32); + } + cg::sync(cta); + + if (cta.thread_rank() == 0 && blockIdx.x < outputSize) { + beta = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) { + beta += tmp[i]; + } + outputVec[blockIdx.x] = beta; + } +} + +__global__ void reduceFinal(double *inputVec, double *result, size_t inputSize) +{ + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double) inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // do reduction in shared mem + if ((blockDim.x >= 512) && (cta.thread_rank() < 256)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256]; + } + + cg::sync(cta); + + if ((blockDim.x >= 256) && (cta.thread_rank() < 128)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128]; + } + + cg::sync(cta); + + if ((blockDim.x >= 128) && (cta.thread_rank() < 64)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 64]; + } + + cg::sync(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockDim.x >= 64) temp_sum += tmp[cta.thread_rank() + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile32.shfl_down(temp_sum, offset); + } + } + // write result for this block to global mem + if (cta.thread_rank() == 0) result[0] = temp_sum; +} + +void init_input(cuda::span a) { + auto generator = []() { return(rand() & 0xFF) / (float)RAND_MAX; }; + std::generate_n(a.data(), a.size(), generator); +} + +void myRealHostNodeCallback(char const *graph_construction_mode, double result) +{ + std::cout << "Host callback in graph constructed by " << graph_construction_mode << ": result = " << 
result << std::endl; + result = 0.0; // reset the result +} + +void CUDART_CB myHostNodeCallback(void *type_erased_data) +{ + auto *data = reinterpret_cast*>(type_erased_data); + auto graph_construction_mode = data->first; + auto result = data->second; + myRealHostNodeCallback(graph_construction_mode, *result); +} + +void report_attempt(const char* attempt_kind, const char* how_created) +{ + std::cout << '\n' + << "Attempting " << attempt_kind << " of a CUDA graph, with construction method: " << how_created << '\n' + << "----------------------------------------------------------------\n"; +} + +void use(const cuda::device_t &device, const cuda::graph::template_t &graph, const char* how_created) +{ + report_attempt("use", how_created); + std::cout << "Number of graph nodes = " << graph.num_nodes() << '\n'; + + auto instance = cuda::graph::instantiate(graph); + + auto cloned = graph.clone(); + auto cloned_graph_instance = cuda::graph::instantiate(graph); + + auto stream_for_graph = cuda::stream::create(device, cuda::stream::async); + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + std::cout + << "Launching an instance of the original graph: launch " + << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << std::endl; + instance.launch(stream_for_graph); + } + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + std::cout + << "Launching an instance of the cloned graph: launch " + << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << std::endl; + cloned_graph_instance.launch(stream_for_graph); + } + std::cout << std::endl; + stream_for_graph.synchronize(); +} + +void cudaGraphsManual( + const cuda::device_t& device, + span inputVec_h, + span inputVec_d, + span outputVec_d, + span result_d) +{ + const char* graph_construction_mode = "explicit node and edge insertion calls"; + report_attempt("construction", graph_construction_mode); + double result_h = 0.0; + + using node_kind_t = cuda::graph::node::kind_t; + auto graph = cuda::graph::create(); + + auto memcpy_node = [&] { + cuda::memory::copy_parameters_t<3> copy_params; + // TODO: Have the copy_parameters_t class be more like a builder. 
+ // And - accept sizes with dimensionality upto the copy params dimensionality + copy_params.set_source(inputVec_h); + copy_params.set_destination(inputVec_d); + // TODO: Need to tweak the copy parameters class so that these next few lines are not necessary; + // and to make sure we don't use the params without everything necessary being set + copy_params.set_extent(inputVec_h.size()); + copy_params.clear_offsets(); + copy_params.clear_rest(); + auto current_context = cuda::context::current::get(); + return graph.insert.node(current_context, copy_params); + }(); + + auto memset_node = [&] { + cuda::graph::node::parameters_t params; + params.value = 0; + params.width_in_bytes = 4; + params.region = outputVec_d; + return graph.insert.node(params); + }(); + + auto reduce_node = [&] { + auto reduce_kernel = cuda::kernel::get(device, reduce); + auto launch_config = cuda::launch_config_builder() + .grid_size(outputVec_d.size()) + .block_size(THREADS_PER_BLOCK) + .build(); + auto kernel_arg_pointers = cuda::graph::make_kernel_argument_pointers( + inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()); + auto kernel_node_args = cuda::graph::make_launch_primed_kernel(reduce_kernel, launch_config, kernel_arg_pointers); + return graph.insert.node(kernel_node_args); + }(); + + graph.insert.edge(memcpy_node, reduce_node); + graph.insert.edge(memset_node, reduce_node); + + auto memset_result_node = [&] { + cuda::graph::node::parameters_t params; + params.value = 0; + params.width_in_bytes = 4; + params.region = result_d; + return graph.insert.node(params); + }(); + + auto reduce_final_node = [&] { + auto kernel = cuda::kernel::get(device, reduceFinal); + auto launch_config = cuda::launch_config_builder() + .grid_size(1) + .block_size(THREADS_PER_BLOCK) + .build(); + auto arg_ptrs = cuda::graph::make_kernel_argument_pointers(outputVec_d.data(), result_d.data(), outputVec_d.size()); + return graph.insert.node(kernel, launch_config, arg_ptrs); + }(); + + graph.insert.edge(reduce_node, reduce_final_node); + graph.insert.edge(memset_result_node, reduce_final_node); + + auto memcpy_result_node = [&] { + cuda::memory::copy_parameters_t<3> copy_params; + // TODO: Have the copy_parameters_t class be more like a builder. + // And - accept sizes with dimensionality upto the copy params dimensionality + copy_params.set_source(result_d); + copy_params.set_destination(&result_h, 1); + copy_params.set_extent(1); + copy_params.clear_offsets(); + copy_params.clear_rest(); + return graph.insert.node(copy_params); + }(); + + graph.insert.edge(reduce_final_node, memcpy_result_node); + + auto host_function_data = std::make_pair(graph_construction_mode, &result_h); + auto host_function_node = graph.insert.node(myHostNodeCallback, &host_function_data); + + graph.insert.edge(memcpy_result_node, host_function_node); + + use(device, graph, graph_construction_mode); +} + +void cudaGraphsManualWithBuilders( + const cuda::device_t& device, + span inputVec_h, + span inputVec_d, + span outputVec_d, + span result_d) +{ + const char* graph_construction_mode = "use of node builders and explicit edge insertions"; + report_attempt("construction", graph_construction_mode); + double result_h = 0.0; + + using node_kind_t = cuda::graph::node::kind_t; + auto graph = cuda::graph::create(); + + // what about building via the graph object? + cuda::graph::node::builder_t builder; + + std::cout << "Building a memcpy node" << std::endl; + + // TODO: Consider having builder_t::memory_copy , builder_t::memory_set etc. 
+ auto memcpy_node = builder.kind() + .source(inputVec_h) + .destination(inputVec_d) + // the following is not necessary - it is deduced + // .extent(inputVec_h.size()) + // No need to clear anything - that's taken care of by default + // .copy_params.clear_offsets(); + // .copy_params.clear_rest(); + // No need to set the context - we default to the current context! + // .context(cuda::context::current::get()) + .build_within(graph); + + std::cout << "Building a memset node" << std::endl; + + auto memset_node = builder.kind() + .region(inputVec_d) + .value(0) + .build_within(graph); + + std::cout << "Building a kernel launch node" << std::endl; + + + auto wrapped_reduce_kernel = cuda::kernel::get(device, reduce); + auto reduce_node = [&]{ + auto launch_config = cuda::launch_config_builder() + .grid_size(outputVec_d.size()) + .block_size(THREADS_PER_BLOCK) + .build(); + + return builder.kind() + .kernel(wrapped_reduce_kernel) + .launch_configuration(launch_config) + .arguments(inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()) + // Also need to support add_argument called multiple times + .build_within(graph); + }(); + + graph.insert.edge(memcpy_node, reduce_node); + graph.insert.edge(memset_node, reduce_node); + + std::cout << "Building a memset node" << std::endl; + + auto memset_result_node = builder.kind() + .region(result_d) + .value(0) + .build_within(graph); + + + std::cout << "Building a kernel launch node" << std::endl; + + auto final_reduce_launch_config = cuda::launch_config_builder() + .grid_size(1) + .block_size(THREADS_PER_BLOCK) + .build(); + + auto wrapped_reduce_final_kernel = cuda::kernel::get(device, reduceFinal); + auto reduce_final_node = builder.kind() + .kernel(wrapped_reduce_final_kernel) + .launch_configuration(final_reduce_launch_config) + .arguments(outputVec_d.data(), result_d.data(), outputVec_d.size()) + // Also need to support add_argument called multiple times + .build_within(graph); + + graph.insert.edge(reduce_node, reduce_final_node); + graph.insert.edge(memset_result_node, reduce_final_node); + + std::cout << "Building a memcpy node" << std::endl; + + auto memcpy_result_node = builder.kind() + .source(result_d) + .destination(cuda::span{&result_h,1}) + .build_within(graph); + + graph.insert.edge(reduce_final_node, memcpy_result_node); + + auto host_function_data = std::make_pair(graph_construction_mode, &result_h); + + std::cout << "Building a host function node" << std::endl; + + auto host_function_node = builder.kind() + .argument(&host_function_data) + .function(myHostNodeCallback) + .build_within(graph); + + graph.insert.edge(memcpy_result_node, host_function_node); + + use(device, graph, graph_construction_mode); +} + +void cudaGraphsUsingStreamCapture( + const cuda::device_t& device, + span inputVec_h, + span inputVec_d, + span outputVec_d, + span result_d) +{ + const char* graph_construction_mode = "stream capture"; + report_attempt("construction", graph_construction_mode); + double result_h = 0.0; + + using cuda::stream::async; + auto stream_1 = cuda::stream::create(device, async); + auto stream_2 = cuda::stream::create(device, async); + auto stream_3 = cuda::stream::create(device, async); + + auto fork_stream_event = cuda::event::create(device); + auto reduce_output_memset_event = cuda::event::create(device); + auto final_result_memset_event = cuda::event::create(device); + + stream_1.begin_capture(cuda::stream::capture::mode_t::global); + + stream_1.enqueue.event(fork_stream_event); + 
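+ // Cross-stream "fork": while stream_1 is being captured, having the other streams
+ // wait on an event recorded on it pulls their subsequent work into the same capture,
+ // so that work becomes additional (potentially concurrent) nodes of the resulting graph.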
stream_2.enqueue.wait(fork_stream_event); + stream_3.enqueue.wait(fork_stream_event); + + stream_1.enqueue.copy(inputVec_d, inputVec_h); + stream_2.enqueue.memzero(outputVec_d); + + stream_2.enqueue.event(reduce_output_memset_event); + stream_3.enqueue.memzero(result_d); + stream_3.enqueue.event(final_result_memset_event); + + stream_1.enqueue.wait(reduce_output_memset_event); + + auto launch_config = cuda::launch_config_builder() + .grid_dimensions(outputVec_d.size()) + .block_dimensions(THREADS_PER_BLOCK) + .build(); + + stream_1.enqueue.kernel_launch(reduce, launch_config, + inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()); + + stream_1.enqueue.wait(final_result_memset_event); + + launch_config = cuda::launch_config_builder() + .grid_dimensions(1) + .block_dimensions(THREADS_PER_BLOCK) + .build(); + stream_1.enqueue.kernel_launch(reduceFinal, launch_config, + outputVec_d.data(), result_d.data(), outputVec_d.size()); + + stream_1.enqueue.copy(&result_h, result_d); + + auto callback = [&]() { myRealHostNodeCallback(graph_construction_mode, result_h); }; + stream_1.enqueue.host_invokable(callback); + + auto graph = stream_1.end_capture(); + + use(device, graph, graph_construction_mode); +} + +[[noreturn]] bool die_(const ::std::string& message) +{ + ::std::cerr << message << "\n"; + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + size_t size { 1 << 24 }; // number of elements to reduce + size_t maxBlocks { 512 }; + + if (cuda::device::count() == 0) { + die_("No CUDA devices on this system"); + } + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? + ::std::stoi(argv[1]) : cuda::device::default_device_id; + + auto device = cuda::device::get(device_id); + + std::cout + << size << " elements\n" + << "threads per block = " << THREADS_PER_BLOCK << '\n' + << "Graph Launch iterations = " << GRAPH_LAUNCH_ITERATIONS << '\n' + << std::flush; + + auto inputVec_h = cuda::memory::host::make_unique_span(size); + auto inputVec_d = cuda::memory::device::make_unique_span(device, size); + auto outputVec_d = cuda::memory::device::make_unique_span(device, maxBlocks); + auto result_d = cuda::memory::device::make_unique_span(device, 1); + + init_input(inputVec_h); + + auto result_verification = ::std::accumulate( +#if __cplusplus >= 201712L + ::std::execution::par_unseq, +#endif + inputVec_h.begin(), inputVec_h.end(), 0.0); + std::cout << "Expected result = " << result_verification << '\n'; + + device.synchronize(); + + cudaGraphsManual(device, inputVec_h, inputVec_d, outputVec_d, result_d); + + cudaGraphsUsingStreamCapture(device, inputVec_h, inputVec_d, outputVec_d, result_d); + + device.synchronize(); + std::flush(std::cout); + + cudaGraphsManualWithBuilders(device, inputVec_h, inputVec_d, outputVec_d, result_d); + + std::cout << "\n\nSUCCESS\n"; +} diff --git a/examples/other/jitify/string_view.hpp b/examples/other/jitify/string_view.hpp index c0fa2b6e..bc1419d2 100644 --- a/examples/other/jitify/string_view.hpp +++ b/examples/other/jitify/string_view.hpp @@ -526,7 +526,7 @@ inline nssv_constexpr std::size_t length( CharT * s, std::size_t result = 0 ) template< typename CharT > inline nssv_constexpr14 std::size_t length( CharT * s ) { - std::size_t result = 0; + ::std::size_t result = 0; while ( *s++ != '\0' ) { ++result; @@ -570,7 +570,7 @@ class basic_string_view typedef const_pointer iterator; typedef const_pointer const_iterator; typedef std::reverse_iterator< const_iterator > reverse_iterator; - 
typedef std::reverse_iterator< const_iterator > const_reverse_iterator; + typedef ::std::reverse_iterator< const_iterator > const_reverse_iterator; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; @@ -1042,14 +1042,14 @@ class basic_string_view #else - std::basic_string + ::std::basic_string to_string() const { return std::basic_string( begin(), end() ); } template< class Allocator > - std::basic_string + ::std::basic_string to_string( Allocator const & a ) const { return std::basic_string( begin(), end(), a ); @@ -1129,12 +1129,12 @@ nssv_constexpr bool operator==( template< class CharT, class Traits> nssv_constexpr bool operator==( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } template< class CharT, class Traits> nssv_constexpr bool operator==( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } @@ -1155,12 +1155,12 @@ nssv_constexpr bool operator!=( template< class CharT, class Traits> nssv_constexpr bool operator!=( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return !( lhs == rhs ); } template< class CharT, class Traits> nssv_constexpr bool operator!=( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return !( lhs == rhs ); } @@ -1181,12 +1181,12 @@ nssv_constexpr bool operator<( template< class CharT, class Traits> nssv_constexpr bool operator<( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) < 0; } template< class CharT, class Traits> nssv_constexpr bool operator<( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) > 0; } @@ -1207,12 +1207,12 @@ nssv_constexpr bool operator<=( template< class CharT, class Traits> nssv_constexpr bool operator<=( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) <= 0; } template< class CharT, class Traits> nssv_constexpr bool operator<=( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) >= 0; } @@ -1233,12 +1233,12 @@ nssv_constexpr bool operator>( template< class CharT, class Traits> nssv_constexpr bool operator>( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) > 0; } template< class CharT, class Traits> nssv_constexpr bool operator>( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) < 0; } @@ -1259,12 +1259,12 @@ nssv_constexpr bool operator>=( template< class CharT, class Traits> nssv_constexpr bool operator>=( basic_string_view lhs, - std::basic_string rhs ) nssv_noexcept + ::std::basic_string rhs ) nssv_noexcept { return lhs.compare( rhs ) >= 0; } template< class CharT, class Traits> nssv_constexpr bool operator>=( - std::basic_string rhs, + ::std::basic_string rhs, basic_string_view lhs ) nssv_noexcept { return rhs.compare( lhs ) <= 0; } @@ -1414,7 +1414,7 @@ Stream & write_to_stream( Stream & os, View const & sv ) template< class CharT, class Traits > std::basic_ostream & operator<<( - std::basic_ostream& os, + ::std::basic_ostream& os, 
basic_string_view sv ) { return detail::write_to_stream( os, sv ); diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index 2024b799..51af11f0 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -56,6 +56,13 @@ #include "api/external.hpp" #endif // CUDA_VERSION >= 10000 +#if CUDA_VERSION >= 10000 +#include "api/graph/node.hpp" +#include "api/graph/typed_node.hpp" +#include "api/graph/template.hpp" +#include "api/graph/instance.hpp" +#endif // CUDA_VERSION >= 10000 + #include "api/multi_wrapper_impls/pointer.hpp" #include "api/multi_wrapper_impls/array.hpp" #include "api/multi_wrapper_impls/event.hpp" @@ -72,7 +79,9 @@ #include "api/multi_wrapper_impls/module.hpp" #include "api/multi_wrapper_impls/ipc.hpp" #include "api/multi_wrapper_impls/launch_configuration.hpp" - #include "api/launch_config_builder.hpp" - +#if CUDA_VERSION >= 10000 +#include "api/multi_wrapper_impls/graph.hpp" +#include "api/graph/node_builder.hpp" +#endif // CUDA_VERSION >= 10000 #endif // CUDA_API_WRAPPERS_HPP_ diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 350c877d..e744a354 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -260,6 +260,12 @@ class context_t { * `my_context.memory::foo()`, think of it as a `my_dev::memory::foo()`. */ class global_memory_type { +#if CUDA_VERSION >= 11000 + public: // data types + using execution_graph_related_attribute_t = CUgraphMem_attribute; +#endif + + protected: // data members const device::id_t device_id_; const context::handle_t context_handle_; @@ -325,6 +331,42 @@ class context_t { CAW_SET_SCOPE_CONTEXT(context_handle_); return context::detail_::free_memory(context_handle_); } + +#if CUDA_VERSION >= 11000 + /** + * @note CUDA devices use special memory allocation for execution graphs - not managed via + * cuMemAlloc and friends. These methods allow for controlling and querying some aspects of this + * memory. + */ + ///@{ + void free_unused_execution_graph_memory() const + { + auto status = cuDeviceGraphMemTrim(device_id_); + throw_if_error_lazy(status, + "Trimming memory used for CUDA execution graphs on " + device::detail_::identify(device_id_)); + } + + /** + * @param attribute See the documentation for @ref CUgraphMem_attribute for the possible attributes + */ + size_t get_execution_graph_related_attribute(execution_graph_related_attribute_t attribute) const + { + cuuint64_t result; + auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result); + throw_if_error_lazy(status, "Failed obtaining an execution-graph-related memory attribute for " + + device::detail_::identify(device_id_)); + return result; + } + + void reset_execution_graph_usage_high_watermark() const + { + cuuint64_t value_{0}; + auto status = cuDeviceSetGraphMemAttribute(device_id_, CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, &value_); + throw_if_error_lazy(status, "Failed setting an execution-graph-related memory attribute for " + + device::detail_::identify(device_id_)); + } + ///@} +#endif }; // class global_memory_type diff --git a/src/cuda/api/detail/for_each_argument.hpp b/src/cuda/api/detail/for_each_argument.hpp new file mode 100644 index 00000000..06b33a78 --- /dev/null +++ b/src/cuda/api/detail/for_each_argument.hpp @@ -0,0 +1,26 @@ +#ifndef CUDA_API_WRAPPERS_FOR_EACH_ARGUMENT_HPP +#define CUDA_API_WRAPPERS_FOR_EACH_ARGUMENT_HPP + +#include + +namespace cuda { + +namespace detail_ { + +template +void for_each_argument(F) { } + +template +void for_each_argument(F f, Args&&... 
args) { + using arrT = int[]; + static_cast(arrT{(f(::std::forward(args)), 0)...}); +// This: +// [](...){}((f(::std::forward(args)), 0)...); +// doesn't guarantee execution order +} + +} // namespaced detail_ + +} // namespaced cuda + +#endif //CUDA_API_WRAPPERS_FOR_EACH_ARGUMENT_HPP diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp index e5a85038..68cae7ac 100644 --- a/src/cuda/api/device.hpp +++ b/src/cuda/api/device.hpp @@ -140,6 +140,74 @@ class device_t { using properties_t = device::properties_t; using attribute_value_t = device::attribute_value_t; +#if CUDA_VERSION >= 11000 + class global_memory_type : public context_t::global_memory_type { + + /** + * Obtains the amount of memory used for execution-graph-related resource on + * this device. + * + * @param reserved + * when false, the memory in actual use at the moment is returned; when true, + * the amount returned is the total reserved for use by the "asynchronous + * allocator" used for execution graphs + * @param high_watermark + * when false, the current amount of memory is returned; when true, the + * returned amount is the "high watermark" of device memory use on the device + * since the last reset. + */ + size_t amount_used_for_graphs( + bool reserved = false, + bool high_watermark = false) const + { + auto attribute = reserved ? + (high_watermark ? CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT : CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH) : + (high_watermark ? CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT : CU_GRAPH_MEM_ATTR_USED_MEM_HIGH); + size_t result; + auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result); + throw_if_error_lazy(status, + "Obtaining the current amount of memory used for execution graphs on " + + device::detail_::identify(device_id_)); + return result; + } + + /** + * Releases any memory allocated for execution graph resources back to the OS + */ + void free_unused_graph_memory() const + { + auto status = cuDeviceGraphMemTrim(device_id_); + throw_if_error_lazy(status, "Freeing unused execution graph memory on " + + device::detail_::identify(device_id_)); + } + + /** + * Obtains the amount of memory used for execution-graph-related resource on + * this device. + * + * @param high_watermark + * when false, and by default, the current amount of memory used is + * returned; when true, the returned amount is the "high watermark" of + * device memory usage for graph resources since the last reset. + */ + size_t amount_used_for_graphs(bool high_watermark = false) const + { + size_t result; + auto status = cuDeviceGetGraphMemAttribute( + device_id_, + high_watermark ? + CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT : + CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, + &result); + throw_if_error_lazy(status, + "Obtaining the current amount of memory used for execution graphs on " + + device::detail_::identify(device_id_)); + return result; + } + + }; +#endif // CUDA_VERSION >= 11000 + /** * * @note The memory proxy regards the device's primary context. 
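As a rough orientation for reviewers, the following minimal sketch shows how the execution-graph memory facilities added in context.hpp and device.hpp above might be exercised. It is illustrative only and not part of the patch; in particular, it assumes the new members are reachable through the library's existing memory-proxy accessor (written here as context.memory()), and uses only the methods defined in the hunks above.

    #include <cuda/api.hpp>
    #include <iostream>

    // Illustrative sketch, not part of the patch: query and trim the memory CUDA keeps
    // around for execution-graph allocations on the device of a given context.
    void report_and_trim_graph_memory(const cuda::context_t& context)
    {
        auto mem = context.memory(); // assumed accessor for the global_memory_type proxy
        // Bytes currently backing execution-graph allocations for this context's device
        auto used_now = mem.get_execution_graph_related_attribute(CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT);
        std::cout << "Graph memory currently in use: " << used_now << " bytes\n";
        mem.reset_execution_graph_usage_high_watermark();
        // Return any graph-allocator memory not currently in use back to the OS
        mem.free_unused_execution_graph_memory();
    }
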
diff --git a/src/cuda/api/external.hpp b/src/cuda/api/external.hpp index db5dfc9a..d4173f51 100644 --- a/src/cuda/api/external.hpp +++ b/src/cuda/api/external.hpp @@ -9,6 +9,8 @@ #ifndef CUDA_API_WRAPPERS_EXTERNAL_HPP_ #define CUDA_API_WRAPPERS_EXTERNAL_HPP_ +#if CUDA_VERSION >= 10000 + #include "memory.hpp" #include "unique_region.hpp" @@ -187,4 +189,6 @@ inline unique_region map(const resource_t& resource) } // namespace memory } // namespace cuda +#endif // CUDA_VERSION >= 10000 + #endif // CUDA_API_WRAPPERS_EXTERNAL_HPP_ diff --git a/src/cuda/api/graph/identify.hpp b/src/cuda/api/graph/identify.hpp new file mode 100644 index 00000000..73290aad --- /dev/null +++ b/src/cuda/api/graph/identify.hpp @@ -0,0 +1,100 @@ +/** + * @file + * + */ +#pragma once +#ifndef CUDA_GRAPH_API_WRAPPERS_ERROR_HPP_ +#define CUDA_GRAPH_API_WRAPPERS_ERROR_HPP_ + +#if CUDA_VERSION >= 10000 + +#include "../error.hpp" +#include "../types.hpp" + +namespace cuda { + +namespace graph { + +namespace template_ { + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return "execution graph template " + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} +/* + +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle); +} + +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle, device_id); +} +*/ + +} // namespace detail_ + +} // namespace template_ + +namespace instance { + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return "execution graph instance " + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(handle_t handle, device::id_t device_id) +{ + return identify(handle) + " on " + device::detail_::identify(device_id); +} + +inline ::std::string identify(handle_t handle, context::handle_t context_handle) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle); +} + +inline ::std::string identify(handle_t handle, context::handle_t context_handle, device::id_t device_id) +{ + return identify(handle) + " on " + context::detail_::identify(context_handle, device_id); +} + +} // namespace detail_ + +} // namespace instance + +namespace node { + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return ::std::string("node with handle ") + ::cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(handle_t node_handle, template_::handle_t graph_template_handle) +{ + return identify(node_handle) + " in " + template_::detail_::identify(graph_template_handle); +} + +} // namespace detail_ + +} // namespace node + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif // CUDA_GRAPH_API_WRAPPERS_ERROR_HPP_ diff --git a/src/cuda/api/graph/instance.hpp b/src/cuda/api/graph/instance.hpp new file mode 100644 index 00000000..fe8179c3 --- /dev/null +++ b/src/cuda/api/graph/instance.hpp @@ -0,0 +1,504 @@ +/** + * @file + * + * @brief A CUDA execution graph instance wrapper class and some + * associated definitions. 
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_INSTANCE_HPP +#define CUDA_API_WRAPPERS_INSTANCE_HPP + +#if CUDA_VERSION >= 10000 + +#include "node.hpp" +#include "identify.hpp" +#include "../types.hpp" + +namespace cuda { + +class stream_t; + +namespace graph { + +class instance_t; + +///@endcond +class template_t; +///@endcond + +namespace instance { + +using update_status_t = CUgraphExecUpdateResult; + +namespace update_status { + +enum named_t : ::std::underlying_type::type { + success = CU_GRAPH_EXEC_UPDATE_SUCCESS, + failure_for_unexpected_reason = CU_GRAPH_EXEC_UPDATE_ERROR, + topology_has_changed = CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED, + node_type_has_changed = CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED, + kernel_node_function_has_changed = CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED, + unsupported_kind_of_parameter_change = CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED, + unsupported_aspect_of_node = CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED, +#if CUDA_VERSION >= 11000 + unsupported_kind_of_kernel_node_function_change = CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE, + unsupported_kind_of_node_attributes_change = CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED, +#endif // CUDA_VERSION >= 11000 +}; + +constexpr inline bool operator==(const update_status_t &lhs, const named_t &rhs) noexcept { return lhs == static_cast(rhs); } +constexpr inline bool operator!=(const update_status_t &lhs, const named_t &rhs) noexcept { return lhs != static_cast(rhs); } +constexpr inline bool operator==(const named_t &lhs, const update_status_t &rhs) noexcept { return static_cast(lhs) == rhs; } +constexpr inline bool operator!=(const named_t &lhs, const update_status_t &rhs) noexcept { return static_cast(lhs) != rhs; } + +namespace detail_ { + +const char *const descriptions[] = { + "success", + "failure for an unexpected reason described in the return value of the function", + "topology has changed", + "node type has changed", + "kernel node function has changed", + "parameters changed in an unsupported way", + "something about the node is not supported", + "unsupported kind of kernel node function change", + "unsupported kind of node attributes change" +}; + +inline bool is_node_specific(update_status_t update_status) +{ + return + (update_status != success) and + (update_status != failure_for_unexpected_reason) and + (update_status != topology_has_changed) and + (update_status != unsupported_kind_of_parameter_change); +} + +} // namespace detail_ + +} // namespace update_status + +namespace detail_ { + +using flags_t = cuuint64_t; + +inline const char *describe(instance::update_status_t update_status) +{ + return instance::update_status::detail_::descriptions[update_status]; +} + +inline ::std::string describe( + instance::update_status_t update_status, + node::handle_t node_handle, + template_::handle_t graph_template_handle); + +} // namespace detail_ + +} // namespace instance + + +/** + * @brief enqueues the execution of a execution-graph instance via a stream + * + * @note recall that graph execution is not serialized on the stream, i.e. one could say that the execution is not + * restricted to that single stream; the relation to the stream is that no graph node will be executed before the + * currently-enqueued work is concluded, and that no further work on the stream will proceed until the graph is fully + * executed. + * + * @note Only one execution graph instance may be executing at a time. 
One can, however, execute the same graph + * concurrently multiple times, but creating multiple instances of the same graph template (@ref template_t). + * It is possible to execute/launch the same instance multiple times _sequentially_. + * + * @note Take care to have the graph either free any allocations it makes, or use a graph instantiated with + * @ref CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH - or the (second) launch of the instance will fail. + * + * @note An execution graph instance must be "uploaded" to the GPU before it is executed - i.e. have + * appropriate physical device resources allocated for it. If this is not done on instantiation, nor + * explicitly with @ref instance::upload, calling this function will first upload it, then schedule + * it. + */ + void launch(const instance_t& instance, const stream_t& stream); + +} // namespace graph + +/** + * @brief Determine whether the API call returning the specified status had succeeded + */ +inline ::std::string describe(graph::instance::update_status_t status) +{ + return graph::instance::detail_::describe(status); +} + +::std::string describe(graph::instance::update_status_t update_status, optional node); + +/** + * @brief Determine whether the API call returning the specified status had succeeded + */ +inline constexpr bool is_success(graph::instance::update_status_t status) +{ + return status == graph::instance::update_status::success; +} + +/** + * @brief Determine whether the API call returning the specified status had failed + */ +constexpr bool is_failure(graph::instance::update_status_t status) { return not is_success(status); } + +namespace graph { + +namespace instance { + +instance_t wrap(template_::handle_t template_handle, handle_t handle, bool is_owning) noexcept; + +namespace detail_ { + +::std::string identify(const instance_t &instance); + +} // namespace detail_ + +// TODO: Add support for reporting errors involving edges +class update_failure : public ::std::runtime_error { +public: + using parent = ::std::runtime_error; + + update_failure( + update_status_t kind, + optional&& impermissible_node, + ::std::string&& what_arg) noexcept + : + parent((what_arg.empty() ? "" : what_arg + ": ") + describe(kind, impermissible_node)), + kind_(kind), + impermissible_node_(std::move(impermissible_node)) + { + // TODO: Ensure the kind needs a node handle IFF a node handle has been provided + } + + update_failure(update_status_t kind, node_t impermissible_node) noexcept + : update_failure(kind, optional(std::move(impermissible_node)), "") + { } + + update_status_t kind() const noexcept { return kind_; } + node_t impermissible_node() const { return impermissible_node_.value(); } + +private: + update_status_t kind_; + optional impermissible_node_; +}; + +/** + * Update the nodes of an execution graph instance with the settings in the nodes + * of a compatible execution graph template. + * + * @param destination An execution graph instance whose node settings are to be updated + * @param source An execution graph template which is either the one from which @p destination + * was instantiated, or one that is "topologically identical" to the instance, i.e. has the + * same types of nodes with the same edges (and no others). 
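+ *
+ * As a hedged illustration (hypothetical user code, assuming `instance` was
+ * instantiated from a graph template `graph_template` whose node parameters
+ * have since been modified):
+ *
+ * @code
+ * cuda::graph::instance::update(instance, graph_template);
+ * // ... or, equivalently, via the instance wrapper itself:
+ * instance.update(graph_template);
+ * @endcode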
+ */ +void update(const instance_t& destination, const template_t& source); + +namespace detail_ { + +template +status_t set_node_parameters_nothrow( + const instance::handle_t instance_handle, + const node::handle_t node_handle, + const typename node::detail_::kind_traits::raw_parameters_type raw_params) +{ + auto raw_params_maybe_ptr = node::detail_::maybe_add_ptr(raw_params); + return node::detail_::kind_traits::instance_setter(instance_handle, node_handle, raw_params_maybe_ptr); +} + +} // namespace detail_ + + +template +void set_node_parameters( + const instance_t& instance, + const node_t& node, + const node::parameters_t parameters); + +} // namespace instance + +namespace detail_ { + +inline void launch_graph_in_current_context(stream::handle_t stream_handle, instance::handle_t graph_instance_handle) +{ + auto status = cuGraphLaunch(graph_instance_handle, stream_handle); + throw_if_error_lazy(status, "Trying to launch " + + instance::detail_::identify(graph_instance_handle) + " on " + stream::detail_::identify(stream_handle)); +} + +inline void launch(context::handle_t context_handle, stream::handle_t stream_handle, instance::handle_t graph_instance_handle) +{ + context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle); + launch_graph_in_current_context(stream_handle, graph_instance_handle); +} + +} // namespace detail_ + +class instance_t { +public: // data types + using handle_type = instance::handle_t; + +public: // getters + template_::handle_t template_handle() const noexcept { return template_handle_; } + handle_type handle() const noexcept { return handle_; } + bool is_owning() const noexcept { return owning_; } + +protected: // constructors + instance_t(template_::handle_t template_handle, handle_type handle, bool owning) noexcept + : template_handle_(template_handle), handle_(handle), owning_(owning) + { } + +public: // constructors & destructor + instance_t(const instance_t& other) noexcept = delete; + + instance_t(instance_t&& other) noexcept : instance_t(other.template_handle_, other.handle_, other.owning_) + { + other.owning_ = false; + } + ~instance_t() + { + if (owning_) cuGraphExecDestroy(handle_); + } + +public: // operators + instance_t& operator=(const instance_t&) = delete; + instance_t& operator=(instance_t&& other) noexcept + { + ::std::swap(template_handle_, other.template_handle_); + ::std::swap(handle_, other.handle_); + ::std::swap(owning_, other.owning_); + return *this; + } + + +public: // friends + friend instance_t instance::wrap(template_::handle_t template_handle, handle_type handle, bool is_owning) noexcept; + +public: // non-mutators + void update(const template_t& update_source) const + { + instance::update(*this, update_source); + } + + void launch(const stream_t& stream) const + { + graph::launch(*this, stream); + } + void upload(const stream_t& stream) const; + + +/* bool free_previous_allocations_before_relaunch +#if CUDA_VERSION >= 12000 + , bool upload_on_instantiation, + , bool make_device_launchable, +#endif // CUDA_VERSION >= 12000 +#if CUDA_VERSION >= 11700 + , bool use_per_node_priorities +#endif*/ + +#if CUDA_VERSION >= 12000 + bool frees_allocations_before_relaunch() const + { + instance::detail_::flags_t flags; + auto status = cuGraphExecGetFlags (handle_, &flags); + throw_if_error_lazy(status, "Obtaining execution graph instance flags"); + return flags & CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH; + } + + bool uses_node_priorities() const + { + instance::detail_::flags_t flags; + auto status = 
cuGraphExecGetFlags (handle_, &flags);
+		throw_if_error_lazy(status, "Obtaining execution graph instance flags");
+		return flags & CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY;
+	}
+
+#endif
+
+	template
+	void set_node_parameters(const node_t& node, node::parameters_t new_parameters)
+	{
+		instance::set_node_parameters(*this, node, std::move(new_parameters));
+	}
+
+	template
+	void set_node_parameters(const node::typed_node_t& node)
+	{
+		instance::set_node_parameters(*this, node);
+	}
+
+protected:
+	template_::handle_t template_handle_;
+	handle_type handle_;
+	bool owning_;
+};
+
+/**
+ * @brief Have a GPU reserve resources for, and maintain a "copy" of, an execution graph
+ * instance, allowing it to actually be scheduled for execution.
+ *
+ * @note The upload can happen in one of three ways:
+ *
+ * 1. On creation of the graph instance (@ref instance_t);
+ * 2. Explicitly, with this function;
+ * 3. Implicitly, if @ref launch is called before the graph instance was otherwise uploaded -
+ *    before the actual scheduling of the instance for execution via the stream.
+ *
+ */
+void upload(const instance_t& instance, const stream_t& stream);
+
+namespace instance {
+
+inline instance_t wrap(template_::handle_t template_handle, handle_t handle, bool is_owning) noexcept
+{
+	return instance_t{template_handle, handle, is_owning};
+}
+
+enum : bool {
+	do_free_previous_allocations_before_relaunch = true,
+	auto_free = true,
+	dont_free_previous_allocations_before_relaunch = false,
+	no_auto_free = false,
+#if CUDA_VERSION >= 12000
+
+	do_upload_on_instantiation = true,
+	dont_upload_on_instantiation = false,
+	auto_upload = true,
+	no_auto_upload = false,
+	manual_upload = false,
+
+	make_launchable_from_device_code = true,
+	dont_make_launchable_from_device_code = false,
+	do_make_device_launchable = true,
+	dont_make_device_launchable = false,
+#endif // CUDA_VERSION >= 12000
+#if CUDA_VERSION >= 11700
+
+	do_use_per_node_priorities = true,
+	do_use_per_node_priority = true,
+	dont_use_per_node_priorities = false,
+	dont_use_per_node_priority = false,
+	use_stream_priority = false
+#endif // CUDA_VERSION >= 11700
+};
+
+namespace detail_ {
+
+#if CUDA_VERSION >= 11000
+inline flags_t build_flags(
+	bool free_previous_allocations_before_relaunch
+#if CUDA_VERSION >= 12000
+	, bool upload_on_instantiation
+	, bool make_device_launchable
+#endif // CUDA_VERSION >= 12000
+#if CUDA_VERSION >= 11700
+	, bool use_per_node_priorities
+#endif
+	)
+{
+	return
+		(free_previous_allocations_before_relaunch ? CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH : 0)
+#if CUDA_VERSION >= 12000
+		| (upload_on_instantiation ? CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD : 0)
+		| (make_device_launchable ? CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH : 0)
+#endif
+#if CUDA_VERSION >= 11700
+		| (use_per_node_priorities ?
CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY : 0) +#endif + ; +} +#endif // CUDA_VERSION >= 11000 + +inline ::std::string identify(const instance_t& instance) +{ + return identify(instance.handle()) + " instantiated from " + + template_::detail_::identify(instance.template_handle()); +} + +inline ::std::string identify(const instance_t& instance, const template_t& template_) +{ + return identify(instance.handle()) + " instantiated from " + + template_::detail_::identify(template_); +} + + + +} // namespace detail_ + +template +void set_node_parameters( + const instance_t& instance, + const node_t& node, + node::parameters_t parameters) +{ + auto status = detail_::set_node_parameters_nothrow( + instance.handle(), node.handle(), node::detail_::kind_traits::marshal(parameters)); + throw_if_error_lazy(status, "Setting parameters of " + node::detail_::identify(node) + + " in " + instance::detail_::identify(instance)); +} + + +template +void set_node_parameters( + const instance_t& instance, + const node::typed_node_t& node_with_new_params) +{ + return set_node_parameters( + instance, static_cast(node_with_new_params), node_with_new_params.parameters()); +} + + +} // namespace instance + +inline instance_t instantiate( + const template_t& template_ +#if CUDA_VERSION >= 11000 + , bool free_previous_allocations_before_relaunch = false +#endif +#if CUDA_VERSION >= 12000 + , bool upload_on_instantiation = false + , bool make_device_launchable = false +#endif +#if CUDA_VERSION >= 11700 + , bool use_per_node_priorities = false +#endif +) +{ +#if CUDA_VERSION >= 11000 + instance::detail_::flags_t flags = instance::detail_::build_flags( + free_previous_allocations_before_relaunch +#if CUDA_VERSION >= 12000 + , upload_on_instantiation, make_device_launchable +#endif +#if CUDA_VERSION >= 11700 + , use_per_node_priorities +#endif + ); +#endif // CUDA_VERSION >= 11000 + instance::handle_t instance_handle; +#if CUDA_VERSION >= 11000 + auto status = cuGraphInstantiateWithFlags(&instance_handle, template_.handle(), static_cast(flags)); + throw_if_error_lazy(status, "Instantiating " + template_::detail_::identify(template_) ); +#else + static constexpr const size_t log_buffer_size { 2048 }; + dynarray log_buffer(log_buffer_size); + node::handle_t error_node; + auto status = cuGraphInstantiate(&instance_handle, template_.handle(), &error_node, log_buffer.data(), log_buffer_size); + throw_if_error_lazy(status, "Instantiating " + template_::detail_::identify(template_) + ": error at " + + node::detail_::identify(error_node) + " ; log buffer contents:\n" + log_buffer.data()); +#endif // CUDA_VERSION >= 11000 + static constexpr const bool is_owning { true }; + return instance::wrap(template_.handle(), instance_handle, is_owning); +} + +void launch(const cuda::stream_t& stream, const instance_t& instance); + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif //CUDA_API_WRAPPERS_INSTANCE_HPP diff --git a/src/cuda/api/graph/node.hpp b/src/cuda/api/graph/node.hpp new file mode 100644 index 00000000..a5cc3501 --- /dev/null +++ b/src/cuda/api/graph/node.hpp @@ -0,0 +1,152 @@ +/** + * @file + * + * @brief Graph template node proxy (base-)class base-class @ref node_t and supporting code. 
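+ *
+ * As a brief sketch of typical (read-only) use of a node wrapper - hypothetical
+ * user code, assuming a @ref node_t named `node` obtained from a graph template:
+ *
+ * @code
+ * auto node_type = node.type_();
+ * auto num_deps  = node.num_dependencies();
+ * for (const auto& dependency : node.dependencies()) {
+ *     // ... inspect dependency.handle(), its own dependencies, etc.
+ * }
+ * @endcode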
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_NODE_HPP +#define CUDA_API_WRAPPERS_NODE_HPP + +#if CUDA_VERSION >= 10000 + +#include "../types.hpp" + +namespace cuda { + +namespace graph { + +///@endcond +class node_t; +class template_t; +class instance_t; +///@endcond + +namespace node { + +node_t wrap(template_::handle_t graph_handle, handle_t handle) noexcept; + +namespace detail_ { + +::std::string identify(const node_t &node); + +} // namespace detail_ + +using type_t = CUgraphNodeType; + +} // namespace node + +/** + * @brief Wrapper class for a CUDA execution graph node + * + * Use this class to pass and receive nodes from/to other CUDA API wrapper functions + * and objects (in particular, `cuda::graph::template_t` and `cuda::graph::instance_t`). + * + * @note { This is a reference-type; it does not own the node, and will not remove + * the node from its graph-template when destroyed. You may therefore safely make copies + * of it. } + * + * @note { A node is always tied to a specific graph-template; it cannot be added to + * another graph-template or used independently. } + */ +class node_t { +public: // data types + using handle_type = node::handle_t; + using dependencies_type = ::std::vector; + using dependents_type = ::std::vector; + using type_type = node::type_t; + using size_type = size_t; + +// TODO: WRITEME +public: + handle_type handle() const noexcept { return handle_; } + template_t containing_graph() const noexcept; + template_::handle_t containing_graph_handle() const noexcept { return graph_template_handle_; } + type_type type_() const + { + type_type result; + auto status = cuGraphNodeGetType(handle_, &result); + throw_if_error_lazy(status, "Obtaining the type of " + node::detail_::identify(*this)); + return result; + } + + size_t num_dependencies() const + { + size_t num_dependencies_; + auto status = cuGraphNodeGetDependencies(handle_, nullptr, &num_dependencies_); + throw_if_error_lazy(status, "Obtaining the number of nodes on which " + node::detail_::identify(*this) + " is dependent"); + return num_dependencies_; + } + + size_t num_dependents() const + { + size_t num_dependents_; + auto status = cuGraphNodeGetDependentNodes(handle_, nullptr, &num_dependents_); + throw_if_error_lazy(status, "Obtaining the number of nodes dependent on " + node::detail_::identify(*this)); + return num_dependents_; + } + + dependencies_type dependencies() const + { + size_type num_dependencies_ {num_dependencies() } ; + ::std::vector node_handles {num_dependencies_ }; + auto status = cuGraphNodeGetDependencies(handle_, node_handles.data(), &num_dependencies_); + throw_if_error_lazy(status, "Obtaining the set nodes on which " + node::detail_::identify(*this) + " is dependent"); + dependencies_type result; + for (const auto& node_handle : node_handles) { + result.emplace_back(node::wrap(graph_template_handle_, node_handle)); + } + return result; + } + + dependencies_type dependents() const + { + size_type num_dependents_ { num_dependents() } ; + ::std::vector node_handles {num_dependents_ }; + auto status = cuGraphNodeGetDependentNodes(handle_, node_handles.data(), &num_dependents_); + throw_if_error_lazy(status, "Obtaining the set nodes dependent on " + node::detail_::identify(*this)); + dependencies_type result; + for (const auto& node_handle : node_handles) { + result.emplace_back(node::wrap(graph_template_handle_, node_handle)); + } + return result; + } + +protected: // constructors and destructors + node_t(template_::handle_t graph_template_handle, handle_type handle) noexcept + : 
graph_template_handle_(graph_template_handle), handle_(handle) { } + +public: // friendship + friend node_t node::wrap(template_::handle_t graph_handle, node::handle_t handle) noexcept; + +public: // constructors and destructors + node_t(const node_t&) noexcept = default; // It's a reference type, so copying is not a problem + node_t(node_t&&) noexcept = default; // It's a reference type, so copying is not a problem + + node_t& operator=(node_t other) noexcept + { + graph_template_handle_ = other.graph_template_handle_; + handle_ = other.handle_; + return *this; + } + +protected: + template_::handle_t graph_template_handle_; + handle_type handle_; +}; + +namespace node { + +inline node_t wrap(template_::handle_t graph_handle, handle_t handle) noexcept +{ + return { graph_handle, handle }; +} + +} // namespace node + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif //CUDA_API_WRAPPERS_NODE_HPP diff --git a/src/cuda/api/graph/node_builder.hpp b/src/cuda/api/graph/node_builder.hpp new file mode 100644 index 00000000..58f3ca19 --- /dev/null +++ b/src/cuda/api/graph/node_builder.hpp @@ -0,0 +1,606 @@ +/** + * @file + * + * @brief Convenience classes for construction execution graph nodes + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_NODE_BUILDER_HPP +#define CUDA_API_WRAPPERS_NODE_BUILDER_HPP + +#if CUDA_VERSION >= 10000 + +#include "typed_node.hpp" + +namespace cuda { + +namespace graph { + +namespace node { + +namespace detail_ { + +inline ::std::logic_error make_unspec_error(const char *node_type, const char *missing_arg_name) +{ + // Yes, returning it, not throwing it. This is an exception builder function + return ::std::logic_error( + ::std::string("Attempt to build a CUDA execution graph node of type ") + node_type + + " without specifying its " + missing_arg_name + " argument"); +} + +} // namespace detail_ + +template +class typed_builder_t; + +class builder_t +{ +public: + template + typed_builder_t kind() { return typed_builder_t{}; } +}; + +// Note: Can't build empty vertices for now + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::child_graph; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool template_ { false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. 
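+	//
+	// A hedged usage sketch (hypothetical caller code, assuming a graph template
+	// `graph_template` and an already-built subgraph template `subgraph`):
+	//
+	//     builder_t{}.kind<kind_t::child_graph>()
+	//         .template_(subgraph)
+	//         .build_within(graph_template);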
+ template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type& template_(template_t subgraph) { + return do_([&] { + params_ = std::move(subgraph); + was_set.template_ = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.template_) { + throw detail_::make_unspec_error("child graph", "child graph template"); + } + return graph_template.insert.node(std::move(params_)); + } +}; // typed_builder_t + +#if CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::record_event; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool event { false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type& event(event_t event) { + return do_([&] { + params_ = std::move(event); + was_set.event = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.event) { + throw detail_::make_unspec_error("record event", "event"); + } + return graph_template.insert.node(std::move(params_)); + } +}; // typed_builder_t + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::wait_on_event; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool event { false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type& event(event_t event) { + return do_([&] { + params_ = std::move(event); + was_set.event = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.event) { + throw detail_::make_unspec_error("wait on event", "event"); + } + return graph_template.insert.node(std::move(params_)); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::host_function_call; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + + struct { + bool function_ptr_set { false }; + bool user_argument_set {false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. 
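+	//
+	// A hedged usage sketch (hypothetical caller code, assuming a graph template
+	// `graph_template`, a host-side callback `my_callback` of type stream::callback_t,
+	// and a `user_data` pointer to pass to it):
+	//
+	//     builder_t{}.kind<kind_t::host_function_call>()
+	//         .function(my_callback)
+	//         .argument(user_data)
+	//         .build_within(graph_template);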
+ template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type function(stream::callback_t host_callback_function) + { + return do_([&] { + params_.function_ptr = host_callback_function; + was_set.function_ptr_set = true; + }); + } + + this_type argument(void* callback_argument) + { + return do_([&] { + params_.user_data = callback_argument; + was_set.user_argument_set = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.function_ptr_set) { + throw detail_::make_unspec_error("kernel_launch", "host callback function pointer"); + } + if (not was_set.user_argument_set) { + throw detail_::make_unspec_error("kernel_launch", "user-specified callback function argument"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::kernel_launch; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_ { + kernel_t { kernel::wrap(cuda::device::id_t(0), nullptr, nullptr) }, + { 0, 0 }, + { } + }; // An ugly way of constructing with invalid junk; see `was_set` below. We could + // have possibly used some kind of optional + + struct { + bool kernel { false }; + bool launch_config { false }; + bool marshalled_arguments { false }; + } was_set; // Yes, this is an ugly alternative to using optionals; but - have + // you ever looked at the implementation of optional?... + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + params_type& params() noexcept { return params_; } + + this_type kernel(const kernel_t& kernel) + { + return do_([&] { + // we can't just make an assignment to the `kernel` field, we have to reassign + // the whole structure... + params_ = { kernel, params_.launch_config, std::move(params_.marshalled_arguments) }; + was_set.kernel = true; + }); + } + + // Note: There is _no_ member for passing an apriori compiled kernel + // function and a device, since that would either mean leaking a primary context ref unit, + // or actually holding on to one in this class, which doesn't make sense. The graph template + // can't hold a ref unit... + + this_type launch_configuration(launch_configuration_t launch_config) + { + return do_([&] { + params_.launch_config = launch_config; + was_set.launch_config = true; + }); + } + + this_type marshalled_arguments(::std::vector argument_ptrs) + { + return do_([&] { + params_.marshalled_arguments = std::move(argument_ptrs); + was_set.marshalled_arguments = true; + }); + } + + template + this_type arguments(Ts&&... 
args) + { + return marshalled_arguments(make_kernel_argument_pointers(::std::forward(args)...)); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.kernel) { + throw detail_::make_unspec_error("kernel_launch", "kernel"); + } + if (not was_set.launch_config) { + throw detail_::make_unspec_error("kernel_launch", "launch configuration"); + } + if (not was_set.marshalled_arguments) { + throw detail_::make_unspec_error("kernel_launch", "launch arguments"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#if CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_allocation; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + using endpoint_t = cuda::memory::endpoint_t; + +protected: + params_type params_; + + struct { + bool device { false }; + bool size_in_bytes {false }; + } was_set; // Yes, this is an ugly alternative to using optionals + + template + this_type& do_(F f) + { + f(); + return *this; + } + +public: + params_type& params() { return params_; } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.device) { + throw detail_::make_unspec_error("memory allocation", "device"); + } + if (not was_set.size_in_bytes) { + throw detail_::make_unspec_error("memory allocation", "allocation size in bytes"); + } + return graph_template.insert.node(params_); + } + + this_type& device(const device_t& device) { + return do_([&]{ params_.first = device; was_set.device = true; }); + } + this_type& size(const size_t size) { + return do_([&]{ params_.second = size; was_set.size_in_bytes = true; }); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11000 + +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_copy; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + using dimensions_type = params_type::dimensions_type; + using endpoint_t = cuda::memory::endpoint_t; +// static constexpr const dimensionality_t num_dimensions = traits::num_dimensions; + + +protected: + params_type params_; + + template + this_type& do_(F f) + { + f(); + return *this; + } + +public: + params_type& params() { return params_; } +// built_type build(); + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + // TODO: What about the extent???!!! + return graph_template.insert.node(params_); + } + +// this_type& context(endpoint_t endpoint, const context_t& context) noexcept +// { +// do_([&] { params_.set_context(endpoint, context); } ); +// } +// +// this_type& single_context(const context_t& context) noexcept +// { +// do_([&] { params_.set_single_context(context); } ); +// } + + // Note: This next variadic method should not be necessary considering + // the one right after it which uses the forwarding idiom; and yet - if we + // only keep the forwarding-source-method, we get errors. +// template +// this_type& source(const Ts&... args) { +// return do_([&]{ params_.set_source(args...); }); +// } + + template + this_type& source(Ts&&... 
args) { + return do_([&]{ params_.set_source(std::forward(args)...); }); + } +// +// template +// this_type& destination(const Ts&... args) { +// return do_([&]{ params_.set_destination(args...); }); +// } + + template + this_type& destination(Ts&&... args) { + return do_([&]{ params_.set_destination(std::forward(args)...); }); + } + + template + this_type& endpoint(endpoint_t endpoint, Ts&&... args) { + return do_([&]{ params_.set_endpoint(endpoint, std::forward(args)...); }); + } + +// this_type& source_untyped(context::handle_t context_handle, void *ptr, dimensions_type dimensions) noexcept +// { +// return do_([&] { params_.set_endpoint_untyped(endpoint_t::source, context_handle, ptr, dimensions); } ); +// } +// +// this_type& destination_untyped(context::handle_t context_handle, void *ptr, dimensions_type dimensions) noexcept +// { +// return do_([&] { params_.set_destination_untyped(context_handle, ptr, dimensions); } ); +// } +// +// this_type& endpoint_untyped(endpoint_t endpoint, context::handle_t context_handle, void *ptr, dimensions_type dimensions) noexcept +// { +// return do_([&] { params_.set_endpoint_untyped(endpoint_t::source, context_handle, ptr, dimensions); } ); +// } + + // TODO: Need a proper builder for copy parameters; otherwise we'll need to implement one here, when it's + // already half-implemented there... it will need: + // 1. To sort out context stuff (already done in the copy parameters, but requires explicit setting atm + // 2. deduce extent when none specified + // 3. prevent direct manipulation of the parameters (which is currently allowed), so that we can apply logic + // such as "has the extent been set?" etc. + // 4. set defaults when relevant, e.g. w.r.t. pitches and such +}; // typed_builder_t + +template <> +class typed_builder_t { + // Note: Unlike memory_copy, for which the underlying parameter type, CUDA_MEMCPY3D_PEER, is also used + // in non-graph context - here the only builder functionality is for graph vertex construction; so we don't + // do any forwarding to a rich parameters class or its own builder. +public: + static constexpr const kind_t kind = kind_t::memory_set; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + struct { + bool region { false }; + bool value_and_width { false }; + } was_set; + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + + template + void set_width() { + } + +public: + const params_type& params() { return params_; } + + this_type region(memory::region_t region) noexcept + { + return do_([&] { params_.region = region; was_set.region = true;}); + } + + template + this_type value(uint32_t v) noexcept + { + static_assert(sizeof(T) <= 4, "Type of value to set is too wide; maximum size is 4"); + static_assert(sizeof(T) != 3, "Size of type to set is not a power of 2"); + static_assert(std::is_trivially_copy_constructible::value, "Only a trivially-constructible value can be used for memset'ing"); + return do_([&] { + params_.width_in_bytes = sizeof(T); + switch(sizeof(T)) { + // TODO: Maybe we should use uint_t template? Maybe use if constexpr with C++17? 
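+			// Note: the element width recorded in params_.width_in_bytes just above is what
+			// determines how many of the value's bytes get replicated across the region.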
+ case 1: params_.value = reinterpret_cast(v); break; + case 2: params_.value = reinterpret_cast(v); break; + case 4: + default: params_.value = reinterpret_cast(v); break; + } + was_set.value_and_width = true; + }); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.region) { + throw detail_::make_unspec_error("memory set", "memory region"); + } + if (not was_set.value_and_width) { + throw detail_::make_unspec_error("memory set", "value to set"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#if CUDA_VERSION >= 11000 +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_free; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + struct { + bool address { false }; + } was_set; + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. + template this_type& do_(F f) { f(); return *this; } + +public: + const params_type& params() { return params_; } + + this_type region(void* address) noexcept { return do_([&] { params_ = address; was_set.address = true;}); } + this_type region(memory::region_t allocated_region) noexcept { return this->region(allocated_region.data()); } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.address) { + throw detail_::make_unspec_error("memory free", "allocated region starting address"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11000 + +#if CUDA_VERSION >= 11070 +template <> +class typed_builder_t { +public: + static constexpr const kind_t kind = kind_t::memory_barrier; + using this_type = typed_builder_t; + using built_type = typed_node_t; + using traits = cuda::graph::node::detail_::kind_traits; + using params_type = traits::parameters_type; + +protected: + params_type params_; + struct { + bool context { false }; + bool barrier_socpe { false }; + } was_set; + + // This wrapper method ensures the builder-ish behavior, i.e. always returning the builder + // for further work via method invocation. 
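+	//
+	// Note: build_within() below requires both the context and the barrier scope to
+	// have been set; omitting either throws a ::std::logic_error.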
+ template this_type& do_(F f) { f(); return *this; } + +public: + const params_type& params() { return params_; } + + this_type context(context_t context) noexcept + { + return do_([&] { + params_.first = std::move(context); + was_set.context = true;}); + } + + this_type context(memory::barrier_scope_t barrier_socpe) noexcept + { + return do_([&] { params_.second = barrier_socpe; was_set.barrier_socpe = true;}); + } + + [[maybe_unused]] built_type build_within(const cuda::graph::template_t& graph_template) + { + if (not was_set.context) { + throw detail_::make_unspec_error("memory barrier", "CUDA context"); + } + if (not was_set.barrier_socpe) { + throw detail_::make_unspec_error("memory barrier", "barrier scope"); + } + return graph_template.insert.node(params_); + } +}; // typed_builder_t + +#endif // CUDA_VERSION >= 11070 + +} // namespace node + +} // namespace graph + +} // namespace cuda + +#endif // CUDA_VERSION >= 10000 + +#endif //CUDA_API_WRAPPERS_NODE_BUILDER_HPP diff --git a/src/cuda/api/graph/template.hpp b/src/cuda/api/graph/template.hpp new file mode 100644 index 00000000..bc7e3f34 --- /dev/null +++ b/src/cuda/api/graph/template.hpp @@ -0,0 +1,829 @@ +/** + * @file + * + * @brief An implementation of a subclass of @ref `kernel_t` for kernels + * compiled together with the host-side program. + * + * @todo Implement batch mem op insertion and param setting: + * cuGraphAddBatchMemOpNode, cuGraphBatchMemOpNodeGetParams, cuGraphBatchMemOpNodeSetParams + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_GRAPH_TEMPLATE_HPP +#define CUDA_API_WRAPPERS_GRAPH_TEMPLATE_HPP + +#if CUDA_VERSION >= 10000 + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cuda { + +///@cond +class device_t; +class stream_t; +///@endcond + +namespace graph { + +///@endcond +class template_t; +class instance_t; +///@endcond + +namespace node { + +namespace detail_ { + +// I'm not so sure about this... +using edge_t = ::std::pair; + +inline ::std::string identify(const edge_t &edge) +{ + return ::std::string("edge from " + node::detail_::identify(edge.first) + + " to " + node::detail_::identify(edge.second)); +} + +template +handle_t as_handle(const NodeOrHandle& node_or_handle) noexcept +{ + return node_or_handle.handle(); +} + +template <> inline handle_t as_handle(const handle_t& handle) noexcept { return handle; } + +template