diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index c601aab..cb6cc85 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -186,7 +186,7 @@ class Device : public Wrapper<CUdevice> {
   }
 
   std::string getArch() const {
-#if defined(__HIP_PLATFORM_AMD__)
+#if defined(__HIP__)
     hipDeviceProp_t prop;
     checkCudaCall(hipGetDeviceProperties(&prop, _ordinal));
     return prop.gcnArchName;
@@ -704,6 +704,94 @@ class GraphKernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS> {
   }
 };
 
+class GraphHostNodeParams : public Wrapper<CUDA_HOST_NODE_PARAMS> {
+ public:
+  GraphHostNodeParams(void (*fn)(void *), void *data) {
+    _obj.fn = fn;
+    _obj.userData = data;
+  }
+};
+
+class GraphDevMemAllocNodeParams : public Wrapper<CUDA_MEM_ALLOC_NODE_PARAMS> {
+ public:
+  GraphDevMemAllocNodeParams(const Device &dev, size_t size) {
+    _obj.bytesize = size;
+    _obj.poolProps.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
+    _obj.poolProps.location.id = dev.getOrdinal();
+    _obj.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  }
+
+  const CUdeviceptr &getDevPtr() { return _obj.dptr; }
+
+  const void *parameter() { return DeviceMemory(_obj.dptr).parameter(); }
+
+  const DeviceMemory getDeviceMemory() {
+    return DeviceMemory(_obj.dptr, _obj.bytesize);
+  }
+};
+
+class GraphMemCopyToDeviceNodeParams : public Wrapper<CUDA_MEMCPY3D> {
+ public:
+  GraphMemCopyToDeviceNodeParams(const DeviceMemory &memory, const void *src,
+                                 size_t size_x, size_t size_y, size_t size_z) {
+    memset(&_obj, 0, sizeof(CUDA_MEMCPY3D));
+    _obj.srcMemoryType = CU_MEMORYTYPE_HOST;
+    _obj.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    _obj.srcHost = src;
+    _obj.dstDevice = static_cast<CUdeviceptr>(memory);
+    _obj.srcXInBytes = 0;
+    _obj.srcY = 0;
+    _obj.srcZ = 0;
+    _obj.dstXInBytes = 0;
+    _obj.dstY = 0;
+    _obj.dstZ = 0;
+
+    _obj.WidthInBytes = size_x;
+    _obj.Height = size_y;
+    _obj.Depth = size_z;
+  }
+
+  GraphMemCopyToDeviceNodeParams(const CUdeviceptr &dst, const void *src,
+                                 size_t size_x, size_t size_y, size_t size_z) {
+    memset(&_obj, 0, sizeof(CUDA_MEMCPY3D));
+    _obj.srcMemoryType = CU_MEMORYTYPE_HOST;
+    _obj.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    _obj.srcHost = src;
+    _obj.dstDevice = dst;
+    _obj.srcXInBytes = 0;
+    _obj.srcY = 0;
+    _obj.srcZ = 0;
+    _obj.dstXInBytes = 0;
+    _obj.dstY = 0;
+    _obj.dstZ = 0;
+
+    _obj.WidthInBytes = size_x;
+    _obj.Height = size_y;
+    _obj.Depth = size_z;
+  }
+};
+
+class GraphMemCopyToHostNodeParams : public Wrapper<CUDA_MEMCPY3D> {
+ public:
+  GraphMemCopyToHostNodeParams(void *dst, const CUdeviceptr &src, size_t size_x,
+                               size_t size_y, size_t size_z) {
+    memset(&_obj, 0, sizeof(CUDA_MEMCPY3D));
+    _obj.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    _obj.dstMemoryType = CU_MEMORYTYPE_HOST;
+    _obj.srcDevice = src;
+    _obj.dstHost = dst;
+    _obj.srcXInBytes = 0;
+    _obj.srcY = 0;
+    _obj.srcZ = 0;
+    _obj.dstXInBytes = 0;
+    _obj.dstY = 0;
+    _obj.dstZ = 0;
+    _obj.WidthInBytes = size_x;
+    _obj.Height = size_y;
+    _obj.Depth = size_z;
+  }
+};
+
 class Graph : public Wrapper<CUgraph> {
  public:
   explicit Graph(CUgraph &graph) : Wrapper(graph) {};
@@ -717,10 +805,57 @@ class Graph : public Wrapper<CUgraph> {
   void addKernelNode(GraphNode &node,
                      const std::vector<CUgraphNode> &dependencies,
-                     const GraphKernelNodeParams &params) {
-    checkCudaCall(cuGraphAddKernelNode(node.getNode(), _obj,
-                                       dependencies.data(), dependencies.size(),
-                                       (CUDA_KERNEL_NODE_PARAMS *)(&params)));
+                     GraphKernelNodeParams &params) {
+    checkCudaCall(cuGraphAddKernelNode(
+        node.getNode(), _obj, dependencies.data(), dependencies.size(),
+        reinterpret_cast<CUDA_KERNEL_NODE_PARAMS *>(&params)));
+  }
+
+  void addHostNode(GraphNode &node,
+                   const std::vector<CUgraphNode> &dependencies,
+                   GraphHostNodeParams &params) {
+    checkCudaCall(cuGraphAddHostNode(
+        node.getNode(), _obj, dependencies.data(), dependencies.size(),
+        reinterpret_cast<CUDA_HOST_NODE_PARAMS *>(&params)));
+  }
+
+  void addDevMemFreeNode(GraphNode &node,
+                         const std::vector<CUgraphNode> &dependencies,
+                         const CUdeviceptr &devPtr) {
+    checkCudaCall(cuGraphAddMemFreeNode(node.getNode(), _obj,
+                                        dependencies.data(),
+                                        dependencies.size(), devPtr));
+  }
+
+  void addDevMemAllocNode(GraphNode &node,
+                          const std::vector<CUgraphNode> &dependencies,
+                          GraphDevMemAllocNodeParams &params) {
+    checkCudaCall(cuGraphAddMemAllocNode(
+        node.getNode(), _obj, dependencies.data(), dependencies.size(),
+        reinterpret_cast<CUDA_MEM_ALLOC_NODE_PARAMS *>(&params)));
+  }
+
+  void addHostToDeviceMemCopyNode(GraphNode &node,
+                                  const std::vector<CUgraphNode> &dependencies,
+                                  GraphMemCopyToDeviceNodeParams &params,
+                                  const Context &ctx) {
+    checkCudaCall(cuGraphAddMemcpyNode(
+        node.getNode(), _obj, dependencies.data(), dependencies.size(),
+        reinterpret_cast<CUDA_MEMCPY3D *>(&params), ctx));
+  }
+
+  void addDeviceToHostMemCopyNode(GraphNode &node,
+                                  const std::vector<CUgraphNode> &dependencies,
+                                  GraphMemCopyToHostNodeParams &params,
+                                  const Context &ctx) {
+    checkCudaCall(cuGraphAddMemcpyNode(
+        node.getNode(), _obj, dependencies.data(), dependencies.size(),
+        reinterpret_cast<CUDA_MEMCPY3D *>(&params), ctx));
+  }
+
+  void exportDotFile(std::string path, CUgraphDebugDot_flags flags =
+                                           CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE) {
+    checkCudaCall(cuGraphDebugDotPrint(_obj, path.c_str(), flags));
   }
 
   CUgraphExec Instantiate(unsigned int flags = CU_GRAPH_DEFAULT) {
@@ -741,9 +876,9 @@ class GraphConditionalHandle : public Wrapper<CUgraphConditionalHandle> {
   }
 };
 
-class WhileNodeParams : public Wrapper<CUgraphNodeParams> {
+class GraphWhileNodeParams : public Wrapper<CUgraphNodeParams> {
  public:
-  explicit WhileNodeParams(GraphConditionalHandle &conditional) {
+  explicit GraphWhileNodeParams(GraphConditionalHandle &conditional) {
     _obj.conditional.type = CU_GRAPH_COND_TYPE_WHILE;
     _obj.conditional.handle = conditional;
     _obj.conditional.size = 1;
@@ -754,12 +889,11 @@
 
   void AddToGraph(Graph &graph, GraphNode &node,
                   const std::vector<CUgraphNode> &dependencies) {
-    checkCudaCall(cuGraphAddNode((CUgraphNode *)&node, graph,
+    checkCudaCall(cuGraphAddNode(reinterpret_cast<CUgraphNode *>(&node), graph,
                                  dependencies.data(), dependencies.size(),
                                  &_obj));
   }
 };
-
 #endif
 
 class GraphExec : public Wrapper<CUgraphExec> {
diff --git a/include/cudawrappers/macros.hpp b/include/cudawrappers/macros.hpp
index 0a1d1f7..ea62308 100644
--- a/include/cudawrappers/macros.hpp
+++ b/include/cudawrappers/macros.hpp
@@ -94,6 +94,8 @@ typedef uint32_t cuuint32_t;
   hipGraphInstantiateFlagUseNodePriority
 #define CUDA_IPC_HANDLE_SIZE HIP_IPC_HANDLE_SIZE
 #define CUDA_KERNEL_NODE_PARAMS hipKernelNodeParams
+#define CUDA_HOST_NODE_PARAMS hipHostNodeParams
+#define CUDA_MEM_ALLOC_NODE_PARAMS hipMemAllocNodeParams
 #define CUDA_R_16BF HIP_R_16BF
 #define CUDA_R_16F HIP_R_16F
 #define CUDA_R_32F HIP_R_32F
@@ -662,6 +664,7 @@ typedef uint32_t cuuint32_t;
 #define cuGetErrorString hipDrvGetErrorString
 #define cuGetProcAddress hipGetProcAddress
 #define cuGraphAddKernelNode hipGraphAddKernelNode
+#define cuGraphAddHostNode hipGraphAddHostNode
 #define cuGraphCreate hipGraphCreate
 #define cuGraphDestroy hipGraphDestroy
 #define cuGraphInstantiateWithFlags hipGraphInstantiateWithFlags
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7d2861b..eae92b3 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,7 +10,7 @@ FetchContent_MakeAvailable(Catch2)
 list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/contrib)
 include(Catch)
 
-set(COMPONENTS cu nvrtc cufft vector_add)
+set(COMPONENTS cu nvrtc cufft vector_add graph)
 if(${CUDAWRAPPERS_BACKEND_CUDA})
   list(APPEND COMPONENTS nvml)
 endif()
@@ -43,3 +43,5 @@ target_link_libraries(test_cufft PUBLIC ${LINK_LIBRARIES} cudawrappers::cufft)
 target_link_libraries(
   test_vector_add PUBLIC ${LINK_LIBRARIES} cudawrappers::nvrtc
 )
+
+target_link_libraries(test_graph PUBLIC ${LINK_LIBRARIES} cudawrappers::nvrtc)
diff --git a/tests/test_graph.cpp b/tests/test_graph.cpp
new file mode 100644
index 0000000..0454dab
--- /dev/null
+++ b/tests/test_graph.cpp
@@ -0,0 +1,154 @@
+#include <array>
+#include <catch2/catch_test_macros.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <cudawrappers/cu.hpp>
+#include <cudawrappers/nvrtc.hpp>
+
+TEST_CASE("Test cu::Graph", "[graph]") {
+  const std::string kernel = R"(
+    extern "C" __global__ void vector_print(float *a, size_t array_size) {
+      int i = blockIdx.x * blockDim.x + threadIdx.x;
+      if (i < 3) {
+        printf("a[%d] = %f\n", i, a[i]);
+        a[i] = 42.0f * 2.0f;
+      }
+    }
+  )";
+
+  cu::init();
+  cu::Device device(0);
+  cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
+  nvrtc::Program program(kernel, "kernel.cu");
+  program.compile({});
+  cu::Module module(static_cast<const void *>(program.getPTX().data()));
+
+  cu::Stream stream;
+
+  SECTION("Test cu::Graph host node with a single value") {
+    auto fn = [](void *data) {
+      int *ptr = static_cast<int *>(data);
+      std::cout << "Host node content: " << *ptr << std::endl;
+      *ptr += 1;
+    };
+    cu::Graph graph;
+    int data = 42;
+    cu::GraphHostNodeParams node_params(fn, &data);
+
+    cu::GraphNode node1, node2;
+    graph.addHostNode(node1, {}, node_params);
+    graph.addHostNode(node2, {node1}, node_params);
+
+    cu::GraphExec graph_exec(graph);
+    cu::Stream stream;
+    stream.lunchGraph(graph_exec);
+    stream.synchronize();
+
+    CHECK(data == 44);
+  }
+
+  SECTION("Test cu::Graph memory management") {
+    std::array<float, 3> data_in{3, 3, 3};
+    std::array<float, 3> data_out{0, 0, 0};
+    std::cout << "Running on device: " << device.getOrdinal() << std::endl;
+    cu::Graph graph;
+    cu::GraphNode dev_alloc, host_set, copy_to_dev, execute_kernel,
+        device_free, copy_to_host;
+
+    auto set_value = [](void *data) {
+      float *ptr = static_cast<float *>(data);
+      for (int i = 0; i < 3; i++) {
+        ptr[i] = 42.0f;
+      }
+    };
+    cu::GraphHostNodeParams host_set_params{set_value, data_in.data()};
+    cu::GraphDevMemAllocNodeParams dev_alloc_params{device, sizeof(data_in)};
+
+    graph.addHostNode(host_set, {}, host_set_params);
+    graph.addDevMemAllocNode(dev_alloc, {}, dev_alloc_params);
+
+    cu::GraphMemCopyToDeviceNodeParams copy_to_dev_params{
+        dev_alloc_params.getDevPtr(), data_in.data(), sizeof(data_in), 1, 1};
+    cu::GraphMemCopyToHostNodeParams copy_to_host_params{
+        data_out.data(), dev_alloc_params.getDevPtr(), sizeof(data_in), 1, 1};
+
+    size_t array_size = 3;
+    cu::DeviceMemory mem(dev_alloc_params.getDeviceMemory());
+    std::vector<const void *> params = {mem.parameter(), &array_size};
+    cu::Function vector_print_fn(module, "vector_print");
+    cu::GraphKernelNodeParams kernel_params{
+        vector_print_fn, 3, 1, 1, 3, 1, 1, 0, params};
+
+    graph.addHostToDeviceMemCopyNode(copy_to_dev, {host_set, dev_alloc},
+                                     copy_to_dev_params, context);
+
+    graph.addKernelNode(execute_kernel, {copy_to_dev}, kernel_params);
+
+    graph.addDeviceToHostMemCopyNode(copy_to_host, {execute_kernel},
+                                     copy_to_host_params, context);
+    graph.addDevMemFreeNode(device_free, {copy_to_host},
+                            dev_alloc_params.getDevPtr());
+
+    cu::GraphExec graph_exec(graph);
+    stream.lunchGraph(graph_exec);
+    stream.synchronize();
+
+    CHECK(data_in[0] == 42.0f);
+    CHECK(data_in[1] == 42.0f);
+    CHECK(data_in[2] == 42.0f);
+    CHECK(data_out[0] == 84.0f);
+    CHECK(data_out[1] == 84.0f);
+    CHECK(data_out[2] == 84.0f);
+  }
+ + SECTION("Test cu:graph debug utilities") { + std::array data_in{3, 3, 3}; + std::array data_out{0, 0, 0}; + std::cout << " Running on device : " << device.getOrdinal() << std::endl; + cu::Graph graph; + cu::GraphNode dev_alloc, host_set, copy_to_dev, execute_kernel, device_free, + copy_to_host; + + auto set_value = [](void* data) { + float* ptr = static_cast(data); + for (int i = 0; i < 3; i++) { + ptr[i] = 42.0f; + } + }; + cu::GraphHostNodeParams host_set_params{set_value, data_in.data()}; + cu::GraphDevMemAllocNodeParams dev_alloc_params{device, sizeof(data_in)}; + + graph.addHostNode(host_set, {}, host_set_params); + graph.addDevMemAllocNode(dev_alloc, {}, dev_alloc_params); + + cu::GraphMemCopyToDeviceNodeParams copy_to_dev_params{ + dev_alloc_params.getDevPtr(), data_in.data(), sizeof(data_in), 1, 1}; + cu::GraphMemCopyToHostNodeParams copy_to_host_params{ + data_out.data(), dev_alloc_params.getDevPtr(), sizeof(data_in), 1, 1}; + + size_t array_size = 3; + cu::DeviceMemory mem(dev_alloc_params.getDeviceMemory()); + std::vector params = {mem.parameter(), &array_size}; + cu::Function vector_print_fn(module, "vector_print"); + cu::GraphKernelNodeParams kernel_params{ + vector_print_fn, 3, 1, 1, 1, 1, 1, 0, params}; + auto ctx = context.getCurrent(); + graph.addHostToDeviceMemCopyNode(copy_to_dev, {host_set, dev_alloc}, + copy_to_dev_params, ctx); + + graph.addKernelNode(execute_kernel, {copy_to_dev}, kernel_params); + + graph.addDeviceToHostMemCopyNode(copy_to_host, {execute_kernel}, + copy_to_host_params, ctx); + + graph.exportDotFile("graph.dot"); + std::ifstream f("graph.dot"); + CHECK(f.good()); + f.close(); + } +} \ No newline at end of file