hip:
hipcc main.cpp -fopenmp -lrocblas --std=c++14 && ./a.out

cuda:
nvcc main.cpp -lcublas && ./a.out
- On a Vega 10 XTX [Radeon Vega Frontier Edition], the best kernel reaches about 80% of rocBLAS performance, which is about 50% of peak performance.
- On an MI100, it reaches only about 30% of rocBLAS performance (the rocBLAS number looks abnormal; rocBLAS is suspected to use the matrix cores), which is about 50% of peak performance.
- The CUDA version was ported from the HIP code; no CUDA-specific performance tuning was done.
- Overall, the fastest kernel is 7.4 (test7_4).
- All numbers below are for M = 4096, N = 4096, K = 4096.
- Hardware: MI100, SCLK 1472 MHz, MCLK 1200 MHz.
- speedup: speedup relative to the previous kernel version
- rocblas_ratio: ratio relative to the rocBLAS execution time
- Tflops: achieved TFLOPS (a formula sketch follows the question list below)
- Tflops_ratio: ratio of the achieved TFLOPS to the MI100 peak FLOPS
- How can the optimization results be explained theoretically?
- How can rocprof be used to pinpoint the performance bottleneck?
- Most CUDA GEMM optimization techniques have already been tried here; some had no effect and some hurt performance. Is that a problem with my implementation, or does HIP itself not support them?
- Most HIP GEMM code is written in assembly; does that mean hipcc's optimization is poor?
- The shared-memory assembly instructions have an issue where some of the data cannot be read; how can this be solved?
- Performance has not reached expectations; how can it be optimized further?
- Why is the rocBLAS TFLOPS figure higher than the GPU's peak performance yet lower than the matrix-core peak? Does rocBLAS use the matrix cores without saying so, or did I miscalculate?
- Both Tensile and the CUDA GEMM implementations use M = 128, N = 128, K = 8 tiling, yet no matter what I try I cannot beat the M = 64, N = 64, K = 16 tiling. Is something wrong with my implementation?
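As a reference for the metrics above, here is a minimal sketch of how the numbers in the result lines can be reproduced. The MI100 FP32 (vector) peak of ~23.1 TFLOPS and the rocBLAS time of ~4.475 ms are not stated explicitly anywhere; they are the values implied by the Tflops_ratio and rocblas_ratio columns below, so treat them as assumptions.

#include <cstdio>

// Hedged sketch: how the metric columns are derived, using test7_4 as the example row.
int main() {
    const double M = 4096, N = 4096, K = 4096;
    const double flop = 2.0 * M * N * K;          // one multiply + one add per inner iteration

    const double time_ms      = 11.1994;          // test7_4
    const double prev_time_ms = 11.992;           // previous kernel version (test7_3)
    const double rocblas_ms   = 4.475;            // implied by time_ms * rocblas_ratio
    const double peak_tflops  = 23.1;             // assumed MI100 FP32 (non matrix-core) peak

    const double tflops = flop / (time_ms * 1e-3) / 1e12;
    printf("speedup:       %g\n", prev_time_ms / time_ms);  // vs. the previous kernel
    printf("rocblas_ratio: %g\n", rocblas_ms / time_ms);    // rocBLAS time / kernel time
    printf("Tflops:        %g\n", tflops);
    printf("Tflops_ratio:  %g\n", tflops / peak_tflops);
    return 0;
}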
test1: 387.927ms
// Naive kernel: one thread per element of C, all operands read directly from global memory.
__global__ void gemm_kernel1(float *d_A, float *d_B, float *d_C, int M, int N, int K) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int ix = idx % N; // column index (n)
    int iy = idx / N; // row index (m)
    if (idx >= M * N) return;
    float total = 0;
    for (int i = 0; i < K; i++) {
        total += d_A[iy * K + i] * d_B[i * N + ix];
    }
    d_C[iy * N + ix] = total;
}
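A minimal launch sketch for this kernel (assumed host code, not taken from main.cpp): the 1D indexing implies one thread per element of C, so the grid must cover M * N threads; the block size of 256 is an assumption.

#include <hip/hip_runtime.h>

// Hedged launch sketch for gemm_kernel1 (block size is an assumption).
void launch_gemm_kernel1(float *d_A, float *d_B, float *d_C, int M, int N, int K) {
    int threads = 256;
    int blocks  = (M * N + threads - 1) / threads;   // one thread per output element
    hipLaunchKernelGGL(gemm_kernel1, dim3(blocks), dim3(threads), 0, 0,
                       d_A, d_B, d_C, M, N, K);
}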
test2: 50.1172ms speedup: 0.0892845, rocblas_ratio: 0.0892845, Tflops: 2.74235, Tflops_ratio: 0.118717
// Shared-memory tiled kernel: each block computes one m x n tile of C, staging an m x k
// tile of A and a k x n tile of B in dynamic shared memory; one thread per output element.
__global__ void gemm_kernel2(float *d_A, float *d_B, float *d_C, int M, int N, int K, int m, int n, int k) {
    extern __shared__ float sh[];
    float *A_sh = sh;          // m x k tile of A
    float *B_sh = sh + m * k;  // k x n tile of B
    int N_tile_index = blockIdx.x % ((N + n - 1) / n);  // tile column index
    int M_tile_index = blockIdx.x / ((N + n - 1) / n);  // tile row index
    int n_index = threadIdx.x % n;  // column index within the tile
    int m_index = threadIdx.x / n;  // row index within the tile
    float total = 0.0f;
    for (int K_tile_index = 0; K_tile_index < K; K_tile_index += k) {
        // Each thread loads one element of each tile into shared memory
        // (this indexing assumes m == n == k).
        if ((M_tile_index * m + m_index) * K + K_tile_index + n_index < M * K) {
            A_sh[m_index * k + n_index] = d_A[(M_tile_index * m + m_index) * K + K_tile_index + n_index];
        } else {
            A_sh[m_index * k + n_index] = 0;
        }
        if ((K_tile_index + m_index) * N + N_tile_index * n + n_index < K * N) {
            B_sh[m_index * n + n_index] = d_B[(K_tile_index + m_index) * N + N_tile_index * n + n_index];
        } else {
            B_sh[m_index * n + n_index] = 0;
        }
        __syncthreads();
        // Each thread computes one output element of the tile.
        for (int k_index = 0; k_index < k; k_index++) {
            total += A_sh[m_index * k + k_index] * B_sh[k_index * n + n_index];
        }
        __syncthreads();
    }
    d_C[(M_tile_index * m + m_index) * N + N_tile_index * n + n_index] = total;
}
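A minimal launch sketch (assumed host code, not taken from main.cpp): square tiles with m = n = k = 16, which matches the one-element-per-thread loads in the kernel.

#include <hip/hip_runtime.h>

// Hedged launch sketch for gemm_kernel2. The tile sizes m = n = k = 16 are an assumption,
// chosen so that the m * n threads load exactly one A and one B element each per K-tile.
void launch_gemm_kernel2(float *d_A, float *d_B, float *d_C, int M, int N, int K) {
    const int m = 16, n = 16, k = 16;
    int threads = m * n;                                   // one thread per tile element: 256
    int blocks  = ((M + m - 1) / m) * ((N + n - 1) / n);   // 1D grid over all C tiles
    size_t shmem = (m * k + k * n) * sizeof(float);        // A tile + B tile
    hipLaunchKernelGGL(gemm_kernel2, dim3(blocks), dim3(threads), shmem, 0,
                       d_A, d_B, d_C, M, N, K, m, n, k);
}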
test3: 25.2284ms speedup: 1.98654, rocblas_ratio: 0.177367, Tflops: 5.44778, Tflops_ratio: 0.235835
// Register-tiled kernel: each block computes one m x n tile of C on a 2D grid, and each
// thread accumulates a reg_size x reg_size sub-block of that tile in registers.
__global__ void gemm_kernel3(float *d_A, float *d_B, float *d_C, int M, int N, int K, int m, int n, int k) {
    const int reg_size = 4;
    extern __shared__ float sh[];
    float *A_sh = sh;          // m x k tile of A
    float *B_sh = sh + m * k;  // k x n tile of B
    int N_tile_index = blockIdx.x;  // tile column index
    int M_tile_index = blockIdx.y;  // tile row index
    int n_index = threadIdx.x % ((n + reg_size - 1) / reg_size);  // column index of this thread's 4x4 sub-block
    int m_index = threadIdx.x / ((n + reg_size - 1) / reg_size);  // row index of this thread's 4x4 sub-block
    float reg_C[reg_size][reg_size] = {0.0f};
    for (int K_tile_index = 0; K_tile_index < int((K + k - 1) / k); K_tile_index++) {
        // Each thread loads several elements of the A tile from global memory.
        int pre_thread_num = (m * k + blockDim.x - 1) / blockDim.x;  // elements per thread
        for (int i = 0; i < pre_thread_num; i++) {
            int ix = threadIdx.x * pre_thread_num + i;
            int n_index = ix % k;
            int m_index = ix / k;
            if ((M_tile_index * m + m_index) * K + K_tile_index * k + n_index < M * K) {
                A_sh[ix] = d_A[(M_tile_index * m + m_index) * K + K_tile_index * k + n_index];
            } else {
                A_sh[ix] = 0;
            }
        }
        // Same for the B tile.
        pre_thread_num = (k * n + blockDim.x - 1) / blockDim.x;
        for (int i = 0; i < pre_thread_num; i++) {
            int ix = threadIdx.x * pre_thread_num + i;
            int n_index = ix % n;
            int m_index = ix / n;
            if ((K_tile_index * k + m_index) * N + N_tile_index * n + n_index < K * N) {
                B_sh[ix] = d_B[(K_tile_index * k + m_index) * N + N_tile_index * n + n_index];
            } else {
                B_sh[ix] = 0;
            }
        }
        __syncthreads();
        // Accumulate this thread's reg_size x reg_size sub-block in registers.
        for (int k_reg_index = 0; k_reg_index < k; k_reg_index += reg_size) {
            for (int i = 0; i < reg_size; i++) {
                for (int j = 0; j < reg_size; j++) {
                    for (int k_index = 0; k_index < reg_size; k_index++) {
                        reg_C[i][j] += A_sh[m_index * reg_size * k + k_reg_index + i * k + k_index]
                                     * B_sh[k_reg_index * n + n_index * reg_size + k_index * n + j];
                    }
                }
            }
        }
        __syncthreads();
    }
    // Write the register block back to C.
    for (int i = 0; i < reg_size; i++) {
        for (int j = 0; j < reg_size; j++) {
            int C_index = (M_tile_index * m + m_index * reg_size) * N + N_tile_index * n + n_index * reg_size + i * N + j;
            if (C_index < M * N) {
                d_C[C_index] = reg_C[i][j];
            }
        }
    }
}
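A minimal launch sketch (assumed host code): the m = 64, n = 64, k = 16 tiling mentioned in the questions above, with a 2D grid of tiles and (m/4) * (n/4) = 256 threads per block.

#include <hip/hip_runtime.h>

// Hedged launch sketch for gemm_kernel3. The 64 x 64 x 16 tile shape is taken from the
// notes above; the exact host code in main.cpp may differ.
void launch_gemm_kernel3(float *d_A, float *d_B, float *d_C, int M, int N, int K) {
    const int m = 64, n = 64, k = 16, reg_size = 4;
    dim3 grid((N + n - 1) / n, (M + m - 1) / m);           // x: tile column, y: tile row
    dim3 block((m / reg_size) * (n / reg_size));           // 16 * 16 = 256 threads
    size_t shmem = (m * k + k * n) * sizeof(float);        // A tile + B tile
    hipLaunchKernelGGL(gemm_kernel3, grid, block, shmem, 0,
                       d_A, d_B, d_C, M, N, K, m, n, k);
}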
test3_1: 25.5791ms speedup: 0.986289, rocblas_ratio: 0.174935, Tflops: 5.37309, Tflops_ratio: 0.232601
test3_2: 17.7575ms speedup: 1.44047, rocblas_ratio: 0.251988, Tflops: 7.73976, Tflops_ratio: 0.335055
test3_3: 17.6317ms speedup: 1.00714, rocblas_ratio: 0.253787, Tflops: 7.79499, Tflops_ratio: 0.337446
test3_4: 17.9752ms speedup: 0.980892, rocblas_ratio: 0.248937, Tflops: 7.64604, Tflops_ratio: 0.330998
test3_5: 17.7971ms speedup: 1.01, rocblas_ratio: 0.251427, Tflops: 7.72253, Tflops_ratio: 0.334309
test3_6: 22.8915ms speedup: 0.777455, rocblas_ratio: 0.195474, Tflops: 6.00392, Tflops_ratio: 0.25991
test4: 14.3921ms speedup: 1.59056, rocblas_ratio: 0.310912, Tflops: 9.5496, Tflops_ratio: 0.413402
test5: 13.553ms speedup: 1.06192, rocblas_ratio: 0.330163, Tflops: 10.1409, Tflops_ratio: 0.438999
test6: 164.493ms speedup: 0.0823921, rocblas_ratio: 0.0272028, Tflops: 0.835529, Tflops_ratio: 0.036170
test7: 12.9557ms speedup: 12.6966, rocblas_ratio: 0.345385, Tflops: 10.6084, Tflops_ratio: 0.459238
test7_1: 14.2125ms speedup: 0.91157, rocblas_ratio: 0.314842, Tflops: 9.6703, Tflops_ratio: 0.418628
test7_2: 14.1483ms speedup: 1.00454, rocblas_ratio: 0.316271, Tflops: 9.71418, Tflops_ratio: 0.420527
test7_3: 11.992ms speedup: 1.17981, rocblas_ratio: 0.373138, Tflops: 11.4608, Tflops_ratio: 0.49614
test7_4: 11.1994ms speedup: 1.07077, rocblas_ratio: 0.399545, Tflops: 12.2719, Tflops_ratio: 0.531253
test7_5: 12.0331ms speedup: 0.930723, rocblas_ratio: 0.371866, Tflops: 11.4218, Tflops_ratio: 0.494449
test7_6: 13.1542ms speedup: 0.914768, rocblas_ratio: 0.340171, Tflops: 10.4483, Tflops_ratio: 0.452307
test7_7: 15.514ms speedup: 0.847894, rocblas_ratio: 0.288429, Tflops: 8.85904, Tflops_ratio: 0.383508
test7_8: 11.2175ms speedup: 1.38302, rocblas_ratio: 0.398904, Tflops: 12.2522, Tflops_ratio: 0.5304
test8_1: 14.2521ms speedup: 1.00704, rocblas_ratio: 0.313967, Tflops: 9.64343, Tflops_ratio: 0.417465
test8_2: 13.474ms speedup: 1.05774, rocblas_ratio: 0.332097, Tflops: 10.2003, Tflops_ratio: 0.441571
test8_3: 22.2234ms speedup: 0.606299, rocblas_ratio: 0.20135, Tflops: 6.18442, Tflops_ratio: 0.267724
test8_4: 13.0805ms speedup: 1.69897, rocblas_ratio: 0.342088, Tflops: 10.5072, Tflops_ratio: 0.454855
test9: 14.2987ms speedup: 1.58132, rocblas_ratio: 0.312943, Tflops: 9.61198, Tflops_ratio: 0.416103
test9_1: 16.2988ms speedup: 1.05635, rocblas_ratio: 0.274541, Tflops: 8.43245, Tflops_ratio: 0.365041
test10: 186.349ms speedup: 0.0874637, rocblas_ratio: 0.0240123, Tflops: 0.737533, Tflops_ratio: 0.031927
- Global memory reads and writes are actually asynchronous; if you only use plain HIP, however, they all become synchronous instructions.
  Example:
      global_load<0>(ptr, register);
  The first argument is the address, the second is the register. The register must be a Float4, which is different from float4; see [2]. Note: do not write from global memory directly into shared memory; stage the data through a register. (A combined usage sketch of these helpers follows this list.)
- Global memory synchronization:
      vmcnt<0>();
  This instruction waits for all outstanding global memory loads to complete before execution continues, so it can serve as a synchronization point (note: different memory-access instructions are issued out of order).
- Shared memory synchronization:
      lgkmcnt<0>();
  This instruction waits for shared memory loads to complete.
- The shared-memory load instructions still have some issues; details will be added once they are sorted out.
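Below is a minimal sketch of how these helpers might be combined in a tile-load path. It assumes the global_load<>, vmcnt<> and lgkmcnt<> wrappers and the Float4 register type described above (and in [2]/[3]); their exact signatures are only as summarized in the notes, not verified here, and this is not the code actually used in the kernels.

// Illustrative only: global_load<>, vmcnt<>, lgkmcnt<> and Float4 are the assumed
// inline-asm helpers from the notes above. A 128-bit load is issued asynchronously,
// staged in a register, and only then written to shared memory (LDS).
__device__ void load_tile_async(const float *gA, float *A_sh, int lane) {
    Float4 buf;                                    // register staging area
    global_load<0>(gA + 4 * lane, buf);            // issue the asynchronous global load
    // ... independent work can be issued here while the load is in flight ...
    vmcnt<0>();                                    // wait for all outstanding global loads
    reinterpret_cast<Float4 *>(A_sh)[lane] = buf;  // now write the staged data into LDS
    lgkmcnt<0>();                                  // wait for the LDS operation to complete
}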
[1] HIP-Performance-Optmization-on-VEGA64: HIP performance analysis
[2] Explanation of global memory loads
[3] How to use inline assembly
[4] How to write a HIP GEMM
[6] GEMM optimization, also written in assembly
[7] CUDA SGEMM矩阵乘法优化笔记——从入门到cublas (CUDA SGEMM optimization notes, from basics to cuBLAS)