forked from Rmalavally/rocm-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.hip
204 lines (165 loc) · 7.58 KB
/
main.hip
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// MIT License
//
// Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
/// \brief Kernel function to perform matrix transpose using shared memory of constant size.
/// Expects a 2D launch whose grid covers the full matrix, i.e.
/// gridDim.x * blockDim.x == Width and gridDim.y * blockDim.y == Width.
/// \param out Device pointer receiving the transposed matrix (Width * Width floats).
/// \param in  Device pointer to the input matrix (Width * Width floats).
template<unsigned int Width>
__global__ void matrix_transpose_static_shared(float* out, const float* in)
{
// Shared memory of constant size: Width * Width floats per block.
// NOTE(review): the buffer is indexed with *global* thread coordinates below,
// so each block only touches the disjoint slice matching its own tile, and
// every thread reads back the element it wrote itself.
__shared__ float shared_mem[Width * Width];
// Get the two-dimensional global thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
// Stage the transposed element in shared memory (reads in[] column-wise)
shared_mem[y * Width + x] = in[x * Width + y];
// Synchronize all threads within a thread block
__syncthreads();
// Write the matrix transpose into the global memory
out[y * Width + x] = shared_mem[y * Width + x];
}
/// \brief Kernel function to perform matrix transpose using dynamic shared memory.
/// The launch must pass at least width * width * sizeof(float) bytes of dynamic
/// shared memory, and the 2D grid must cover the full matrix, i.e.
/// gridDim.x * blockDim.x == width and gridDim.y * blockDim.y == width.
/// \param out   Device pointer receiving the transposed matrix (width * width floats).
/// \param in    Device pointer to the input matrix (width * width floats).
/// \param width Matrix dimension (matrix is width x width).
__global__ void matrix_transpose_dynamic_shared(float* out, const float* in, const int width)
{
// Dynamic shared memory; size is supplied as the third kernel-launch argument.
// NOTE(review): indexed with *global* thread coordinates, so each block only
// touches the slice matching its own tile (see the static variant above the
// caller for the same pattern).
extern __shared__ float shared_mem[];
// Get the two-dimensional global thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
// Stage the transposed element in shared memory (reads in[] column-wise)
shared_mem[y * width + x] = in[x * width + y];
// Synchronize all threads within a thread block
__syncthreads();
// Write the matrix transpose into the global memory
out[y * width + x] = shared_mem[y * width + x];
}
/// \brief Runs the two transpose kernels concurrently, one per stream, and
/// asynchronously copies each result back to its host output buffer.
/// \tparam Width Matrix dimension (matrix is Width x Width).
/// \tparam Size  Total element count; must equal Width * Width.
/// \param h_in               Pinned host input matrix of Size floats.
/// \param h_transpose_matrix One pinned host output buffer (Size floats) per stream.
/// \param num_streams        Number of streams to create. This example launches
///                           exactly two kernels, so it is intended for 2.
template<unsigned int Width, unsigned int Size>
void deploy_multiple_stream(const float* h_in,
                            std::vector<float*>& h_transpose_matrix,
                            const int num_streams)
{
    // Set the block dimensions
    constexpr unsigned int threads_per_block_x = 4;
    constexpr unsigned int threads_per_block_y = 4;

    // Make sure that Width is evenly divisible by threads_per_block_x and threads_per_block_y
    static_assert(Width % threads_per_block_x == 0);
    static_assert(Width % threads_per_block_y == 0);

    // Create streams
    std::vector<hipStream_t> streams(num_streams);
    for(int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipStreamCreate(&streams[i]));
    }

    // Size in bytes for memory management
    const size_t size_in_bytes = sizeof(float) * Size;

    // Allocate one device input and one device output buffer per stream.
    // (Previously these allocations hardcoded indices 0 and 1, which silently
    // broke the function for any num_streams != 2.)
    std::vector<float*> d_in(num_streams);
    std::vector<float*> d_transpose_matrix(num_streams);
    for(int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipMalloc(&d_in[i], size_in_bytes));
        HIP_CHECK(hipMalloc(&d_transpose_matrix[i], size_in_bytes));
    }

    for(int i = 0; i < num_streams; i++)
    {
        // hipMemcpyAsync is used without needing to sync before the kernel launch
        // because both the hipMemcpyAsync and the kernel launch reside in the same
        // stream: the kernel executes only after the copy finishes (implicit
        // stream-order synchronization).
        // Note: if the host memory was not pinned via hipHostMalloc, hipMemcpyAsync
        // behaves synchronously.
        HIP_CHECK(hipMemcpyAsync(d_in[i], h_in, size_in_bytes, hipMemcpyHostToDevice, streams[i]));
    }

    // Launch kernel with streams[0] (static shared memory variant)
    matrix_transpose_static_shared<Width>
        <<<dim3(Width / threads_per_block_x, Width / threads_per_block_y),
           dim3(threads_per_block_x, threads_per_block_y),
           0,
           streams[0]>>>(d_transpose_matrix[0], d_in[0]);
    // Kernel launches do not return an error code; query it explicitly so a bad
    // launch configuration is not silently ignored.
    HIP_CHECK(hipGetLastError());

    // Launch kernel with streams[1] (dynamic shared memory variant; the third
    // launch argument supplies the shared-memory size in bytes)
    matrix_transpose_dynamic_shared<<<dim3(Width / threads_per_block_x,
                                           Width / threads_per_block_y),
                                      dim3(threads_per_block_x, threads_per_block_y),
                                      sizeof(float) * Width * Width,
                                      streams[1]>>>(d_transpose_matrix[1], d_in[1], Width);
    HIP_CHECK(hipGetLastError());

    // Asynchronously copy the results from device to host
    for(int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipMemcpyAsync(h_transpose_matrix[i],
                                 d_transpose_matrix[i],
                                 size_in_bytes,
                                 hipMemcpyDeviceToHost,
                                 streams[i]));
    }

    // Wait for all tasks in all streams to complete on the device
    HIP_CHECK(hipDeviceSynchronize());

    // Destroy the streams
    for(int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipStreamDestroy(streams[i]));
    }

    // Free device memory
    for(int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipFree(d_in[i]));
        HIP_CHECK(hipFree(d_transpose_matrix[i]));
    }
}
/// \brief Entry point: allocates pinned host buffers, runs the two-stream
/// transpose example, and releases all host memory.
int main()
{
    // Dimension of the input square matrix is width x width
    constexpr unsigned int width = 32;
    constexpr unsigned int size = width * width;

    // Number of streams to be used. It is hardcoded to 2 as this example
    // demonstrates only two kernel launches and their management.
    constexpr unsigned int num_streams = 2;

    // Size in bytes for memory management
    const size_t size_in_bytes = sizeof(float) * size;

    // Allocate host input and output memory as pinned memory using hipHostMalloc.
    // Pinning ensures the copies performed with hipMemcpyAsync are truly
    // asynchronous.
    // Host input memory
    float* h_in = nullptr;
    HIP_CHECK(hipHostMalloc(&h_in, size_in_bytes));

    // One host output buffer per stream. Allocate them in a loop keyed on
    // num_streams (previously hardcoded to indices 0 and 1) so this stays
    // consistent with the free loop below.
    std::vector<float*> h_transpose_matrix(num_streams);
    for(unsigned int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipHostMalloc(&h_transpose_matrix[i], size_in_bytes));
    }

    // Initialize the host input matrix
    for(unsigned int i = 0; i < size; i++)
    {
        h_in[i] = static_cast<float>(i);
    }

    deploy_multiple_stream<width, size>(h_in, h_transpose_matrix, num_streams);

    // Free host memory
    HIP_CHECK(hipHostFree(h_in));
    for(unsigned int i = 0; i < num_streams; i++)
    {
        HIP_CHECK(hipHostFree(h_transpose_matrix[i]));
    }

    std::cout << "streams completed!" << std::endl;
}