Commit

Fix format
matmanc committed Jan 17, 2025
1 parent f23bd0d commit ae7acad
Showing 4 changed files with 303 additions and 10 deletions.
152 changes: 143 additions & 9 deletions include/cudawrappers/cu.hpp
@@ -186,7 +186,7 @@ class Device : public Wrapper<CUdevice> {
}

std::string getArch() const {
#if defined(__HIP_PLATFORM_AMD__)
#if defined(__HIP__)
hipDeviceProp_t prop;
checkCudaCall(hipGetDeviceProperties(&prop, _ordinal));
return prop.gcnArchName;
@@ -704,6 +704,94 @@ class GraphKernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS> {
}
};

class GraphHostNodeParams : public Wrapper<CUDA_HOST_NODE_PARAMS> {
public:
GraphHostNodeParams(void (*fn)(void *), void *data) {
_obj.fn = fn;
_obj.userData = data;
}
};

class GraphDevMemAllocNodeParams : public Wrapper<CUDA_MEM_ALLOC_NODE_PARAMS> {
public:
GraphDevMemAllocNodeParams(const Device &dev, size_t size) {
_obj.bytesize = size;
_obj.poolProps.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
_obj.poolProps.location.id = dev.getOrdinal();
_obj.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
}

const CUdeviceptr &getDevPtr() { return _obj.dptr; }

const void *parameter() { return DeviceMemory(_obj.dptr).parameter(); }

const DeviceMemory getDeviceMemory() {
return DeviceMemory(_obj.dptr, _obj.bytesize);
}
};

class GraphMemCopyToDeviceNodeParams : public Wrapper<CUDA_MEMCPY3D> {
public:
GraphMemCopyToDeviceNodeParams(const DeviceMemory &memory, const void *src,
size_t size_x, size_t size_y, size_t size_z) {
memset(&_obj, 0, sizeof(CUDA_MEMCPY3D));
_obj.srcMemoryType = CU_MEMORYTYPE_HOST;
_obj.dstMemoryType = CU_MEMORYTYPE_DEVICE;
_obj.srcHost = src;
_obj.dstDevice = static_cast<CUdeviceptr>(memory);
_obj.srcXInBytes = 0;
_obj.srcY = 0;
_obj.srcZ = 0;
_obj.dstXInBytes = 0;
_obj.dstY = 0;
_obj.dstZ = 0;

_obj.WidthInBytes = size_x;
_obj.Height = size_y;
_obj.Depth = size_z;
}

GraphMemCopyToDeviceNodeParams(const CUdeviceptr &dst, const void *src,
size_t size_x, size_t size_y, size_t size_z) {
memset(&_obj, 0, sizeof(CUDA_MEMCPY3D));
_obj.srcMemoryType = CU_MEMORYTYPE_HOST;
_obj.dstMemoryType = CU_MEMORYTYPE_DEVICE;
_obj.srcHost = src;
_obj.dstDevice = dst;
_obj.srcXInBytes = 0;
_obj.srcY = 0;
_obj.srcZ = 0;
_obj.dstXInBytes = 0;
_obj.dstY = 0;
_obj.dstZ = 0;

_obj.WidthInBytes = size_x;
_obj.Height = size_y;
_obj.Depth = size_z;
}
};

class GraphMemCopyToHostNodeParams : public Wrapper<CUDA_MEMCPY3D> {
public:
GraphMemCopyToHostNodeParams(void *dst, const CUdeviceptr &src, size_t size_x,
size_t size_y, size_t size_z) {
memset(&_obj, 0, sizeof(CUDA_MEMCPY3D));
_obj.srcMemoryType = CU_MEMORYTYPE_DEVICE;
_obj.dstMemoryType = CU_MEMORYTYPE_HOST;
_obj.srcDevice = src;
_obj.dstHost = dst;
_obj.srcXInBytes = 0;
_obj.srcY = 0;
_obj.srcZ = 0;
_obj.dstXInBytes = 0;
_obj.dstY = 0;
_obj.dstZ = 0;
_obj.WidthInBytes = size_x;
_obj.Height = size_y;
_obj.Depth = size_z;
}
};

class Graph : public Wrapper<CUgraph> {
public:
explicit Graph(CUgraph &graph) : Wrapper(graph) {};
@@ -717,10 +805,57 @@ class Graph : public Wrapper<CUgraph> {

void addKernelNode(GraphNode &node,
const std::vector<CUgraphNode> &dependencies,
const GraphKernelNodeParams &params) {
checkCudaCall(cuGraphAddKernelNode(node.getNode(), _obj,
dependencies.data(), dependencies.size(),
(CUDA_KERNEL_NODE_PARAMS *)(&params)));
GraphKernelNodeParams &params) {
checkCudaCall(cuGraphAddKernelNode(
node.getNode(), _obj, dependencies.data(), dependencies.size(),
reinterpret_cast<CUDA_KERNEL_NODE_PARAMS *>(&params)));
}

void addHostNode(GraphNode &node,
const std::vector<CUgraphNode> &dependencies,
GraphHostNodeParams &params) {
checkCudaCall(cuGraphAddHostNode(
node.getNode(), _obj, dependencies.data(), dependencies.size(),
reinterpret_cast<CUDA_HOST_NODE_PARAMS *>(&params)));
}

void addDevMemFreeNode(GraphNode &node,
const std::vector<CUgraphNode> &dependencies,
const CUdeviceptr &devPtr) {
checkCudaCall(cuGraphAddMemFreeNode(node.getNode(), _obj,
dependencies.data(),
dependencies.size(), devPtr));
}

void addDevMemAllocNode(GraphNode &node,
const std::vector<CUgraphNode> &dependencies,
GraphDevMemAllocNodeParams &params) {
checkCudaCall(cuGraphAddMemAllocNode(
node.getNode(), _obj, dependencies.data(), dependencies.size(),
reinterpret_cast<CUDA_MEM_ALLOC_NODE_PARAMS *>(&params)));
}

void addHostToDeviceMemCopyNode(GraphNode &node,
const std::vector<CUgraphNode> &dependencies,
GraphMemCopyToDeviceNodeParams &params,
const Context &ctx) {
checkCudaCall(cuGraphAddMemcpyNode(
node.getNode(), _obj, dependencies.data(), dependencies.size(),
reinterpret_cast<CUDA_MEMCPY3D *>(&params), ctx));
}

void addDeviceToHostMemCopyNode(GraphNode &node,
const std::vector<CUgraphNode> &dependencies,
GraphMemCopyToHostNodeParams &params,
const Context &ctx) {
checkCudaCall(cuGraphAddMemcpyNode(
node.getNode(), _obj, dependencies.data(), dependencies.size(),
reinterpret_cast<CUDA_MEMCPY3D *>(&params), ctx));
}

void exportDotFile(std::string path, CUgraphDebugDot_flags flags =
CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE) {
checkCudaCall(cuGraphDebugDotPrint(_obj, path.c_str(), flags));
}

CUgraphExec Instantiate(unsigned int flags = CU_GRAPH_DEFAULT) {
@@ -741,9 +876,9 @@ class GraphConditionalHandle : public Wrapper<CUgraphConditionalHandle> {
}
};

class WhileNodeParams : public Wrapper<CUgraphNodeParams> {
class GraphWhileNodeParams : public Wrapper<CUgraphNodeParams> {
public:
explicit WhileNodeParams(GraphConditionalHandle &conditional) {
explicit GraphWhileNodeParams(GraphConditionalHandle &conditional) {
_obj.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
_obj.conditional.handle = conditional;
_obj.conditional.size = 1;
@@ -754,12 +889,11 @@ class WhileNodeParams : public Wrapper<CUgraphNodeParams> {

void AddToGraph(Graph &graph, GraphNode &node,
const std::vector<CUgraphNode> &dependencies) {
checkCudaCall(cuGraphAddNode((CUgraphNode *)&node, graph,
checkCudaCall(cuGraphAddNode(reinterpret_cast<CUgraphNode *>(&node), graph,
dependencies.data(), dependencies.size(),
&_obj));
}
};

#endif

class GraphExec : public Wrapper<CUgraphExec> {
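For orientation, below is a minimal sketch (not part of this commit) of how the new allocation and copy node wrappers compose into a graph. It uses only the cu::Graph, cu::GraphNode, cu::GraphExec, cu::Stream, cu::Device, and cu::Context wrappers from cu.hpp; the helper name copy_roundtrip is illustrative, the kernel node is omitted, and a 1-D copy is expressed as a 3-D copy with Height = Depth = 1.

#include <array>

#include <cudawrappers/cu.hpp>

// Round-trip a small host buffer through a graph-owned device allocation.
void copy_roundtrip(cu::Device &device, cu::Context &context,
                    cu::Stream &stream) {
  std::array<float, 3> in{1, 2, 3};
  std::array<float, 3> out{0, 0, 0};

  cu::Graph graph;
  cu::GraphNode alloc, to_dev, to_host, release;

  // The device buffer is allocated by the graph itself; its address is
  // available up front through getDevPtr().
  cu::GraphDevMemAllocNodeParams alloc_params{device, sizeof(in)};
  graph.addDevMemAllocNode(alloc, {}, alloc_params);

  // 1-D copies: WidthInBytes = sizeof(in), Height = Depth = 1.
  cu::GraphMemCopyToDeviceNodeParams h2d{alloc_params.getDevPtr(), in.data(),
                                         sizeof(in), 1, 1};
  cu::GraphMemCopyToHostNodeParams d2h{out.data(), alloc_params.getDevPtr(),
                                       sizeof(in), 1, 1};
  graph.addHostToDeviceMemCopyNode(to_dev, {alloc}, h2d, context);
  graph.addDeviceToHostMemCopyNode(to_host, {to_dev}, d2h, context);
  graph.addDevMemFreeNode(release, {to_host}, alloc_params.getDevPtr());

  cu::GraphExec exec(graph);
  stream.lunchGraph(exec);  // launch call as spelled in tests/test_graph.cpp
  stream.synchronize();
}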
3 changes: 3 additions & 0 deletions include/cudawrappers/macros.hpp
@@ -94,6 +94,8 @@ typedef uint32_t cuuint32_t;
hipGraphInstantiateFlagUseNodePriority
#define CUDA_IPC_HANDLE_SIZE HIP_IPC_HANDLE_SIZE
#define CUDA_KERNEL_NODE_PARAMS hipKernelNodeParams
#define CUDA_HOST_NODE_PARAMS hipHostNodeParams
#define CUDA_MEM_ALLOC_NODE_PARAMS hipMemAllocNodeParams
#define CUDA_R_16BF HIP_R_16BF
#define CUDA_R_16F HIP_R_16F
#define CUDA_R_32F HIP_R_32F
@@ -662,6 +664,7 @@ typedef uint32_t cuuint32_t;
#define cuGetErrorString hipDrvGetErrorString
#define cuGetProcAddress hipGetProcAddress
#define cuGraphAddKernelNode hipGraphAddKernelNode
#define cuGraphAddHostNode hipGraphAddHostNode
#define cuGraphCreate hipGraphCreate
#define cuGraphDestroy hipGraphDestroy
#define cuGraphInstantiateWithFlags hipGraphInstantiateWithFlags
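As a small illustration (not part of this commit), the wrapper methods above are written once against the CUDA driver-API names; with the HIP backend, the new aliases make the same call site resolve to hipHostNodeParams and hipGraphAddHostNode. A sketch using only types from cu.hpp (the helper add_counter_node is illustrative):

#include <cudawrappers/cu.hpp>

// Adds a host callback node that increments a counter. On CUDA this forwards
// to cuGraphAddHostNode; on HIP the defines above map the same code to
// hipGraphAddHostNode.
void add_counter_node(cu::Graph &graph, cu::GraphNode &node, int *counter) {
  cu::GraphHostNodeParams params(
      [](void *data) { ++*static_cast<int *>(data); }, counter);
  graph.addHostNode(node, {}, params);
}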
4 changes: 3 additions & 1 deletion tests/CMakeLists.txt
@@ -10,7 +10,7 @@ FetchContent_MakeAvailable(Catch2)
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/contrib)
include(Catch)

set(COMPONENTS cu nvrtc cufft vector_add)
set(COMPONENTS cu nvrtc cufft vector_add graph)
if(${CUDAWRAPPERS_BACKEND_CUDA})
list(APPEND COMPONENTS nvml)
endif()
@@ -43,3 +43,5 @@ target_link_libraries(test_cufft PUBLIC ${LINK_LIBRARIES} cudawrappers::cufft)
target_link_libraries(
test_vector_add PUBLIC ${LINK_LIBRARIES} cudawrappers::nvrtc
)

target_link_libraries(test_graph PUBLIC ${LINK_LIBRARIES} cudawrappers::nvrtc)
154 changes: 154 additions & 0 deletions tests/test_graph.cpp
@@ -0,0 +1,154 @@
#include <array>
#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <cudawrappers/cu.hpp>
#include <cudawrappers/nvrtc.hpp>

TEST_CASE("Test cu::Graph", "[graph]") {
const std::string kernel = R"(
extern "C" __global__ void vector_print(float *a, size_t array_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < 3) {
printf("a[%d] = %f\n", i, a[i]);
a[i] = 42.0f * 2.0f;
}
}
)";

cu::init();
cu::Device device(0);
cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
nvrtc::Program program(kernel, "kernel.cu");
program.compile({});
cu::Module module(static_cast<const void*>(program.getPTX().data()));

cu::Stream stream;

SECTION("Test cu::hostNode single value") {
auto fn = [](void* data) {
int* ptr = static_cast<int*>(data);
std::cout << "Host node content: " << *ptr << std::endl;
*ptr += 1;
};
cu::Graph graph;
int data = 42;
cu::GraphHostNodeParams node_params(fn, &data);

cu::GraphNode node1, node2;
graph.addHostNode(node1, {}, node_params);
graph.addHostNode(node2, {node1}, node_params);

cu::GraphExec graph_exec(graph);
cu::Stream stream;
stream.lunchGraph(graph_exec);
stream.synchronize();

CHECK(data == 44);
}
SECTION("Test cu::Graph : memory management") {
std::array<float, 3> data_in{3, 3, 3};
std::array<float, 3> data_out{0, 0, 0};
std::cout << " Running on device : " << device.getOrdinal() << std::endl;
cu::Graph graph;
cu::GraphNode dev_alloc, host_set, copy_to_dev, execute_kernel, device_free,
copy_to_host;

auto set_value = [](void* data) {
float* ptr = static_cast<float*>(data);
for (int i = 0; i < 3; i++) {
ptr[i] = 42.0f;
}
};
cu::GraphHostNodeParams host_set_params{set_value, data_in.data()};
cu::GraphDevMemAllocNodeParams dev_alloc_params{device, sizeof(data_in)};

graph.addHostNode(host_set, {}, host_set_params);
graph.addDevMemAllocNode(dev_alloc, {}, dev_alloc_params);

cu::GraphMemCopyToDeviceNodeParams copy_to_dev_params{
dev_alloc_params.getDevPtr(), data_in.data(), sizeof(data_in), 1, 1};
cu::GraphMemCopyToHostNodeParams copy_to_host_params{
data_out.data(), dev_alloc_params.getDevPtr(), sizeof(data_in), 1, 1};

size_t array_size = 3;
cu::DeviceMemory mem(dev_alloc_params.getDeviceMemory());
std::vector<const void*> params = {mem.parameter(), &array_size};
cu::Function vector_print_fn(module, "vector_print");
cu::GraphKernelNodeParams kernel_params{
vector_print_fn, 3, 1, 1, 3, 1, 1, 0, params};

graph.addHostToDeviceMemCopyNode(copy_to_dev, {host_set, dev_alloc},
copy_to_dev_params, context);

graph.addKernelNode(execute_kernel, {copy_to_dev}, kernel_params);

graph.addDeviceToHostMemCopyNode(copy_to_host, {execute_kernel},
copy_to_host_params, context);
graph.addDevMemFreeNode(device_free, {copy_to_host},
dev_alloc_params.getDevPtr());

cu::GraphExec graph_exec(graph);
stream.lunchGraph(graph_exec);
stream.synchronize();

CHECK(data_in[0] == 42.0f);
CHECK(data_in[1] == 42.0f);
CHECK(data_in[2] == 42.0f);
CHECK(data_out[0] == 84.0f);
CHECK(data_out[1] == 84.0f);
CHECK(data_out[2] == 84.0f);
}

SECTION("Test cu:graph debug utilities") {
std::array<float, 3> data_in{3, 3, 3};
std::array<float, 3> data_out{0, 0, 0};
std::cout << " Running on device : " << device.getOrdinal() << std::endl;
cu::Graph graph;
cu::GraphNode dev_alloc, host_set, copy_to_dev, execute_kernel, device_free,
copy_to_host;

auto set_value = [](void* data) {
float* ptr = static_cast<float*>(data);
for (int i = 0; i < 3; i++) {
ptr[i] = 42.0f;
}
};
cu::GraphHostNodeParams host_set_params{set_value, data_in.data()};
cu::GraphDevMemAllocNodeParams dev_alloc_params{device, sizeof(data_in)};

graph.addHostNode(host_set, {}, host_set_params);
graph.addDevMemAllocNode(dev_alloc, {}, dev_alloc_params);

cu::GraphMemCopyToDeviceNodeParams copy_to_dev_params{
dev_alloc_params.getDevPtr(), data_in.data(), sizeof(data_in), 1, 1};
cu::GraphMemCopyToHostNodeParams copy_to_host_params{
data_out.data(), dev_alloc_params.getDevPtr(), sizeof(data_in), 1, 1};

size_t array_size = 3;
cu::DeviceMemory mem(dev_alloc_params.getDeviceMemory());
std::vector<const void*> params = {mem.parameter(), &array_size};
cu::Function vector_print_fn(module, "vector_print");
cu::GraphKernelNodeParams kernel_params{
vector_print_fn, 3, 1, 1, 1, 1, 1, 0, params};
auto ctx = context.getCurrent();
graph.addHostToDeviceMemCopyNode(copy_to_dev, {host_set, dev_alloc},
copy_to_dev_params, ctx);

graph.addKernelNode(execute_kernel, {copy_to_dev}, kernel_params);

graph.addDeviceToHostMemCopyNode(copy_to_host, {execute_kernel},
copy_to_host_params, ctx);

graph.exportDotFile("graph.dot");
std::ifstream f("graph.dot");
CHECK(f.good());
f.close();
}
}
