From d4242ad0385558756ca20e395f152e968af7676a Mon Sep 17 00:00:00 2001 From: Duksu Kim Date: Tue, 13 Dec 2022 15:53:52 +0900 Subject: [PATCH] Add 10-6 --- .../10_6_MatMul_SharedMem/DS_definitions.h | 197 ++++++++++ .../Chap10/10_6_MatMul_SharedMem/DS_timer.cpp | 336 ++++++++++++++++++ .../Chap10/10_6_MatMul_SharedMem/DS_timer.h | 95 +++++ .../10_6_MatMul_SharedMem/MatMul_sharedMem.cu | 248 +++++++++++++ CUDA_Programming.sln | 9 + .../MatMul_sharedMem/MatMul_sharedMem.cu | 10 +- .../10_6_MatMul_SharedMem.vcxproj | 93 +++++ 7 files changed, 983 insertions(+), 5 deletions(-) create mode 100644 Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_definitions.h create mode 100644 Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.cpp create mode 100644 Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.h create mode 100644 Book_BJ/Chap10/10_6_MatMul_SharedMem/MatMul_sharedMem.cu create mode 100644 VS2019/Book_BJ/Chap10/10_6_MatMul_SharedMem/10_6_MatMul_SharedMem.vcxproj diff --git a/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_definitions.h b/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_definitions.h new file mode 100644 index 0000000..e7a6816 --- /dev/null +++ b/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_definitions.h @@ -0,0 +1,197 @@ +#pragma once + +#include +#include +#include +#include + +#define OS_WINDOWS 0 +#define OS_LINUX 1 + +#ifdef _WIN32 +#define _TARGET_OS OS_WINDOWS +#else + #ifndef nullptr + #define nullptr NULL + #endif + #define _TARGET_OS OS_LINUX +#endif + +/************************************************************************/ +/* OS dependet function */ +/************************************************************************/ +#if _TARGET_OS == OS_WINDOWS +// #define _SPRINT sprintf_s +#define _STRTOK strtok_s + +#define EXIT_WIHT_KEYPRESS {std::cout << "Press any key to exit..."; getchar(); exit(0);} + +#define SPLIT_PATH(_path,_result) \ + _splitpath_s(_path, _result.drive, 255, _result.dir, 255, _result.filename, 255, _result.ext, 255) + + +#elif _TARGET_OS == OS_LINUX +#include +#include + +#define _STRTOK strtok_r + +#define EXIT_WIHT_KEYPRESS {std::cout << "Program was terminated!"; exit(0);} + +#define sprintf_s sprintf +#define scanf_s scanf +#define fprintf_s fprintf + +#define __int64 int64_t + +#define fopen_s(fp, name, mode) (*fp = fopen(name, mode)) + +#endif + +/************************************************************************/ +/* Defines */ +/************************************************************************/ + +// *********** data size +#define _1K_ 1024 +#define _1M_ (_1K_*_1K_) +#define _1G_ (_1M_*_1K_) + +#define CHAR_STRING_SIZE 255 + +/************************************************************************/ +/* Type definitions */ +/************************************************************************/ +typedef unsigned int UINT ; + +/************************************************************************/ +/* Macro functions */ +/************************************************************************/ +#define DS_MEM_DELETE(a) \ + if (a != NULL) { \ + delete a ; \ + a = NULL ; \ + } + +#define DS_MEM_DELETE_ARRAY(a) \ + if (a != NULL) { \ + delete [] a ; \ + a = NULL ; \ + } + +#define RANGE_MIN 0 +#define RANGE_MAX 1 + +#define MATCHED_STRING 0 + +#ifndef VTK_RANGE_MIN +#define VTK_RANGE_MIN 0 +#define VTK_RANGE_MAX 1 +#endif + +// Print +#define PRINT_LINE_INFO printf("%s, line %d", __FILE__, __LINE__) +#define PRINT_ERROR_MSG(_msg) {PRINT_LINE_INFO; printf(" at "); printf(_msg);} + +// Single loops +#define LOOP_I(a) for(int i=0; i +void SWAP(T &a, T &b){ + T tmp = a; + a = b; + b = tmp; +} +#endif + +// +#ifndef MIN +#define MIN(a,b) (a > b ? b : a) +#endif + +#ifndef MAX +#define MAX(a,b) (a > b ? a : b) +#endif + +// Index converter + +#define INDEX2X(_ID,_W) (_ID%_W) +#define INDEX2Y(_ID,_W) (_ID/_W) +#define INDEX2ID(_ID,_X,_Y,_W) {_X=INDEX2X(_ID,_W);_Y=INDEX2Y(_ID_,_W);} +#define ID2INDEX(_W,_X,_Y) (_Y*_W+_X) +#define PTR2ID(_type, _target, _base) ((_type*)_target - (_type*)_base) + +// Memory allocation and release +#ifndef SAFE_DELETE +#define SAFE_DELETE(p) {if(p!=NULL) delete p; p=NULL;} +#endif + +#ifndef SAFE_DELETE_ARR +#define SAFE_DELETE_ARR(p) {if(p!=NULL) delete [] p; p=NULL;} +#endif + +#define SAFE_NEW(p, type, size) {\ + try {p = new type[size];} \ + catch(std::bad_alloc& exc) \ + { printf("[%s, line %d] fail to memory allocation - %.2f MB requested\n", __FILE__, __LINE__, (float)(sizeof(type)*size)/_1M_); \ + EXIT_WIHT_KEYPRESS }\ + } + +template +void memsetZero(T** p, long long size = 0) { + if (*p != NULL) + memset(*p, 0, sizeof(T)*size); +} + +template +void allocNinitMem(T** p, long long size, double *memUsage = NULL) { + *p = new T[size]; + //SAFE_NEW(*p, T, size); + memset(*p, 0, sizeof(T)*size); + + if (memUsage != NULL) { + *memUsage += sizeof(T)*size; + } +} + +#define SAFE_MEMCPY(_dst, _src, _type, _size){ \ + if(_dst == nullptr || _src == nullptr ) \ + printf("[%s, line %d] fail to memcpy (dst = %x, src = %x)\n", __FILE__, __LINE__, _dst, _src); \ + exit(-1); \ + memcpy(_dst, _src, sizeof(_type)*_size);\ +} + +// VTK related +#ifndef SAFE_DELETE_VTK +#define SAFE_DELETE_VTK(p) {if(p!=NULL) p->Delete(); p=NULL;} +#endif + +#ifndef VTK_IS_NOERROR +//#include "DS_common_def.h" +#define VTK_IS_NOERROR(p) (p->GetErrorCode()==vtkErrorCode::NoError ? true : false) +#endif + +/************************************************************************/ +/* Data structures */ +/************************************************************************/ +typedef struct { + std::string input; + std::string output; +} nameMatch; + +typedef struct { + char drive[255]; + char dir[255]; + char filename[255]; + char ext[255]; +} filePathSplit; diff --git a/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.cpp b/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.cpp new file mode 100644 index 0000000..876a2c0 --- /dev/null +++ b/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.cpp @@ -0,0 +1,336 @@ +#include "DS_timer.h" +#include "DS_definitions.h" +#include + +//#ifndef _WIN32 +//#define fprintf_s fprintf +//void fopen_s(FILE** fp, char* name, char* mode) +//{ +// *fp = fopen(name, mode); +//} +//#endif + +/************************************************************************/ +/* Constructor & Destructor */ +/************************************************************************/ +DS_timer::DS_timer(int _numTimer /* =0 */, int _numCount /* =0 */, bool _trunOn /* =true */){ + + turnOn = _trunOn ; + start_ticks = NULL ; end_ticks = NULL ; totalTicks = NULL ; + counters = NULL ; + timerStates = NULL ; + timerName = NULL ; + + numTimer = numCounter = 0 ; + + setTimerTitle((char*)"DS_timer Report") ; + + setTimer(_numTimer ) ; + setCounter( _numTimer ) ; + +#ifdef _WIN32 + // For windows + QueryPerformanceFrequency(&ticksPerSecond) ; +#endif +} + +DS_timer::~DS_timer(void){ + releaseTimers() ; + releaseCounters() ; +} + +/************************************************************************/ +/* Set & Get configurations */ +/************************************************************************/ + +// Memory allocation +void DS_timer::memAllocCounters( void ) { + releaseCounters() ; + counters = new UINT[numCounter] ; + initCounters() ; +} + +void DS_timer::memAllocTimers( void ) { + releaseTimers() ; + + timerStates = new bool[numTimer] ; + start_ticks = new TIME_VAL[numTimer]; + end_ticks = new TIME_VAL[numTimer]; + totalTicks = new TIME_VAL[numTimer]; + + // Initialize + memset(timerStates, 0, sizeof(bool)*numTimer); + memset(start_ticks, 0, sizeof(TIME_VAL)*numTimer); + memset(end_ticks, 0, sizeof(TIME_VAL)*numTimer); + memset(totalTicks, 0, sizeof(TIME_VAL)*numTimer); + + timerName = new std::string[numTimer] ; + for ( UINT i = 0 ; i < numTimer ; i++ ) { + timerName[i].clear() ; + timerName[i].resize(255) ; + } +} + +// Memory release +void DS_timer::releaseCounters( void ) { DS_MEM_DELETE_ARRAY(counters) ; } +void DS_timer::releaseTimers( void ) { + //DS_MEM_DELETE(start_ticks) ; DS_MEM_DELETE(end_ticks) ; DS_MEM_DELETE(totalTicks) ; + DS_MEM_DELETE_ARRAY(timerStates) ; + DS_MEM_DELETE_ARRAY(start_ticks) ; + DS_MEM_DELETE_ARRAY(end_ticks) ; + DS_MEM_DELETE_ARRAY(totalTicks) ; + DS_MEM_DELETE_ARRAY(timerName) ; +} + +// Getters +UINT DS_timer::getNumTimer( void ) { return numTimer ; } +UINT DS_timer::getNumCounter( void ) { return numCounter ; } + +// Setters +UINT DS_timer::setTimer( UINT _numTimer ) { + if ( _numTimer == 0 ) + return 0 ; + + if (_numTimer <= numTimer) + return numTimer; + + if ( numTimer != 0 ) { + + // Backup + UINT oldNumTimer = numTimer ; + TIME_VAL *oldTotalTicks = new TIME_VAL[oldNumTimer]; + memcpy(oldTotalTicks, totalTicks, sizeof(TIME_VAL)*oldNumTimer); + + numTimer = _numTimer ; + memAllocTimers() ; + + memcpy(totalTicks, oldTotalTicks, sizeof(TIME_VAL)* oldNumTimer); + delete oldTotalTicks ; + } else { + numTimer = _numTimer ; + memAllocTimers() ; + } + + return _numTimer ; +} + +UINT DS_timer::setCounter( UINT _numCounter ) { + + if (_numCounter == 0 ) + return 0 ; + + if (_numCounter <= numCounter) + return numCounter; + + if ( numCounter != 0 ) { + + // Backup + int numOldCounter = numCounter ; + UINT *oldCounters = new UINT[numOldCounter] ; + memcpy(oldCounters, counters, sizeof(UINT)*numOldCounter) ; + + numCounter = _numCounter ; + memAllocCounters() ; + + // Restore + memcpy(counters, oldCounters, sizeof(UINT)*numOldCounter) ; + delete oldCounters ; + + } else { + numCounter = _numCounter ; + memAllocCounters() ; + } + + return numCounter ; + +} + +/************************************************************************/ +/* Timer */ +/************************************************************************/ +void DS_timer::initTimer( UINT id ) { + timerStates[id] = TIMER_OFF ; +#ifdef _WIN32 + totalTicks[id].QuadPart = 0 ; +#else + totalTicks[id].tv_sec = 0; + totalTicks[id].tv_usec = 0; +#endif +} + +void DS_timer::initTimers( void ) { + for ( UINT i = 0 ; i < numTimer ; i++ ) { + initTimer(i); + } +} + +void DS_timer::onTimer( UINT id ) { + if ( turnOn == false ) + return ; + + if ( timerStates[id] == TIMER_ON ) + return ; +#ifdef _WIN32 + QueryPerformanceCounter(&start_ticks[id]) ; +#else + gettimeofday(&start_ticks[id], NULL); +#endif + + timerStates[id] = TIMER_ON ; +} + +void DS_timer::offTimer( UINT id ) { + if ( turnOn == false ) + return ; + + if ( timerStates[id] == TIMER_OFF ) + return ; + +#ifdef _WIN32 + QueryPerformanceCounter(&end_ticks[id]) ; + totalTicks[id].QuadPart = totalTicks[id].QuadPart + (end_ticks[id].QuadPart - start_ticks[id].QuadPart) ; +#else + gettimeofday(&end_ticks[id], NULL); + TIME_VAL period, previous; + timersub(&end_ticks[id], &start_ticks[id], &period); + previous = totalTicks[id]; + timeradd(&previous, &period, &totalTicks[id]); +#endif + + timerStates[id] = TIMER_OFF ; +} + +double DS_timer::getTimer_ms( UINT id ) { +#ifdef _WIN32 + return ((double)totalTicks[id].QuadPart/(double)ticksPerSecond.QuadPart * 1000) ; +#else + return (double)(totalTicks[id].tv_sec * 1000 + totalTicks[id].tv_usec / 1000.0); +#endif +} + +/************************************************************************/ +/* Counter */ +/************************************************************************/ +void DS_timer::incCounter( UINT id ) { + if ( turnOn == false ) + return ; + + counters[id]++ ; +} + +void DS_timer::initCounters( void ) { + if ( turnOn == false ) + return ; + + for ( UINT i = 0 ; i < numCounter ; i++ ) + counters[i] = 0 ; +} + +void DS_timer::initCounter( UINT id ) { + if ( turnOn == false ) + return ; + + counters[id] = 0 ; +} + +void DS_timer::add2Counter( UINT id, UINT num ) { + if ( turnOn == false ) + return ; + + counters[id] += num ; +} + +UINT DS_timer::getCounter( UINT id ) { + if ( turnOn == false ) + return 0; + + return counters[id] ; +} + +/************************************************************************/ +/* */ +/************************************************************************/ +void DS_timer::printTimer( float _denominator){ + + if ( turnOn == false ) + return ; + + //printf("\n*\t DS_timer Report \t*\n") ; + printf("\n*\t %s \t*\n", timerTitle) ; + printf("* The number of timer = %d, counter = %d\n", numTimer, numCounter ) ; + printf("**** Timer report ****\n") ; + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( getTimer_ms(i) == 0 ) + continue ; + if ( timerName[i].c_str()[0] == 0 ) + printf("Timer %d : %.5f ms\n", i, getTimer_ms(i) ) ; + else + printf("%s : %.5f ms\n", timerName[i].c_str(), getTimer_ms(i) ) ; + } + + printf("**** Counter report ****\n") ; + for ( UINT i = 0 ; i < numCounter ;i++ ) { + if ( counters[i] == 0 ) + continue ; + printf("Counter %d : %.3f (%d) \n",i, counters[i]/_denominator, counters[i] ) ; + } + + printf("*\t End of the report \t*\n") ; +} + +void DS_timer::printToFile( char* fileName, int _id ) +{ + if ( turnOn == false ) + return ; + + FILE *fp ; + + if ( fileName == NULL) + fopen_s(&fp, "DS_timer_report.txt", "a") ; + else { + fopen_s(&fp, fileName, "a") ; + } + + if ( _id >= 0 ) + fprintf_s(fp,"%d\t", _id) ; + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( getTimer_ms(i) == 0 ) + continue ; + fprintf_s(fp, "%s: %.9f\n", timerName[i].c_str(), getTimer_ms(i) ) ; + } + + for ( UINT i = 0 ; i < numCounter ;i++ ) { + if ( counters[i] == 0 ) + continue ; + fprintf_s(fp, "%s: %d\n", timerName[i].c_str(), counters[i] ) ; + } + + fprintf_s(fp, "\n") ; + + fclose(fp) ; +} + +void DS_timer::printTimerNameToFile( char* fileName ) +{ + if ( turnOn == false ) + return ; + + FILE *fp ; + + if ( fileName == NULL) + fopen_s(&fp, "DS_timer_name.txt", "a") ; + else { + fopen_s(&fp, fileName, "a") ; + } + + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( timerName[i].empty() ) + continue ; + fprintf_s(fp, "%s\t", timerName[i].c_str() ) ; + } + fprintf_s(fp, "\n") ; + fclose(fp) ; +} diff --git a/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.h b/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.h new file mode 100644 index 0000000..d1c3391 --- /dev/null +++ b/Book_BJ/Chap10/10_6_MatMul_SharedMem/DS_timer.h @@ -0,0 +1,95 @@ +//#pragma once +#ifndef _DS_TIMER_H +#define _DS_TIMER_H + +#include // std string + +#ifndef UINT +typedef unsigned int UINT; +#endif + +#ifdef _WIN32 + // For windows + #include + typedef LARGE_INTEGER TIME_VAL; +#else + // For Unix/Linux + #include + #include + #include + #include // c string + typedef struct timeval TIME_VAL; +#endif + +#define TIMER_ON true +#define TIMER_OFF false + +class DS_timer +{ +private : + + bool turnOn ; + + UINT numTimer ; + UINT numCounter ; + + // For timers + bool* timerStates ; + TIME_VAL ticksPerSecond; + TIME_VAL *start_ticks; + TIME_VAL *end_ticks; + TIME_VAL *totalTicks; + + char timerTitle[255] ; + std::string *timerName ; + + // For counters + UINT *counters ; + + void memAllocCounters ( void ) ; + void memAllocTimers ( void ) ; + void releaseCounters ( void ) ; + void releaseTimers ( void ) ; + +public: + DS_timer(int _numTimer = 1, int _numCount = 1, bool _trunOn = true ); + ~DS_timer(void); + + // For configurations + inline void timerOn ( void ) { turnOn = TIMER_ON ; } + inline void timerOff ( void ) { turnOn = TIMER_OFF ; } + + UINT getNumTimer( void ) ; + UINT getNumCounter ( void ) ; + UINT setTimer ( UINT _numTimer ) ; + UINT setCounter ( UINT _numCounter ) ; + + // For timers + + void initTimer(UINT id) ; + void initTimers ( void ); + void onTimer(UINT id) ; + void offTimer(UINT id ) ; + double getTimer_ms(UINT id) ; + + void setTimerTitle ( char* _name ) { memset(timerTitle, 0, sizeof(char)*255) ; memcpy(timerTitle, _name, strlen(_name)) ; } + + void setTimerName (UINT id, std::string &_name) { timerName[id] = _name ; } + void setTimerName (UINT id, char* _name) { timerName[id] = _name ;} + + // For counters + + void incCounter(UINT id) ; + void initCounters( void ) ; + void initCounter(UINT id) ; + void add2Counter( UINT id, UINT num ) ; + UINT getCounter ( UINT id ) ; + + // For reports + + void printTimer ( float _denominator = 1 ) ; + void printToFile ( char* fileName, int _id = -1 ) ; + void printTimerNameToFile ( char* fileName ) ; +} ; + +#endif \ No newline at end of file diff --git a/Book_BJ/Chap10/10_6_MatMul_SharedMem/MatMul_sharedMem.cu b/Book_BJ/Chap10/10_6_MatMul_SharedMem/MatMul_sharedMem.cu new file mode 100644 index 0000000..e2d1530 --- /dev/null +++ b/Book_BJ/Chap10/10_6_MatMul_SharedMem/MatMul_sharedMem.cu @@ -0,0 +1,248 @@ +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include "DS_timer.h" +#include +#include +#include + +#define DATA_TYPE int + +#define SIZE_M (512*2) +#define SIZE_N (512*4) +#define SIZE_K (512*2) + +#define INDEX2ROW(_index,_width) (int)((_index)/(_width)) +#define INDEX2COL(_index,_width) ((_index)%(_width)) +#define ID2INDEX(_row,_col, _width) (((_row)*(_width))+(_col)) + +#define BLOCK_SIZE 16 + +// macro function +#define IS_EQUAL(_a, _b) (abs(_b - _a) < 10e-6) + +/****************************************************************** +* Modify this kernel to use shared memory +******************************************************************/ +__global__ void MatMul_SharedMem(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = blockDim.y * blockIdx.y + threadIdx.y; + + DATA_TYPE val = 0; + __shared__ DATA_TYPE subA[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ DATA_TYPE subB[BLOCK_SIZE][BLOCK_SIZE]; + + int localRow = threadIdx.x; + int localCol = threadIdx.y; + + for (int bID = 0; bID < ceil((float)k / BLOCK_SIZE); bID++) { + int offset = bID * BLOCK_SIZE; + + // load A and B + if (row >= m || offset + localCol >= k) + subA[localRow][localCol] = 0; + else + subA[localRow][localCol] = matA[row * k + (offset + localCol)]; + + if (col >= n || offset + localRow >= k) + subB[localRow][localCol] = 0; + else + subB[localRow][localCol] = matB[(offset + localRow) * n + col]; + + __syncthreads(); + + // compute + for (int i = 0; i < BLOCK_SIZE; i++) { + val += subA[localRow][i] * subB[i][localCol]; + } + __syncthreads(); + } + + if (row >= m || col >= n) + return; + + matC[row * n + col] = val; +} +/****************************************************************** +******************************************************************/ + +template void allocNinitMem(T** p, long long size, DATA_TYPE* memUsage = NULL); +void runMatMul_Basic(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k); +bool compareMatrix(DATA_TYPE* _A, DATA_TYPE* _B, int _size); + +DS_timer timer(10); +void setTimer(); + +int main(int argc, char* argv[]) +{ + setTimer(); + + // set matrix size + int m, n, k; + m = SIZE_M; + n = SIZE_N; + k = SIZE_K; + + printf("Size : A = (%d by %d), B = (%d by %d), C = (%d by %d)\n", m, k, k, n, m, n); + + int sizeA = m * k; + int sizeB = k * n; + int sizeC = m * n; + + // Make matrix + DATA_TYPE* A = NULL, * B = NULL; + allocNinitMem(&A, sizeA); + allocNinitMem(&B, sizeB); + + DATA_TYPE* Ccpu = NULL, * Cgpu = NULL; + allocNinitMem(&Ccpu, sizeC); + allocNinitMem(&Cgpu, sizeC); + + // generate input matrices + for (int i = 0; i < sizeA; i++) A[i] = ((rand() % 10) + ((rand() % 100) / 100.0)); + for (int i = 0; i < sizeB; i++) B[i] = ((rand() % 10) + ((rand() % 100) / 100.0)); + + // CPU version (OpenMP) + timer.onTimer(0); + for (int row = 0; row < m; row++) { + for (int col = 0; col < n; col++) { + int cIndex = row * n + col; + Ccpu[cIndex] = 0; + for (int i = 0; i < k; i++) + Ccpu[cIndex] += (A[row * k + i] * B[i * n + col]); + } + } + printf("CPU finished!\n"); + timer.offTimer(0); + + // GPU setup + DATA_TYPE* dA, * dB, * dC; + + cudaMalloc(&dA, sizeA * sizeof(DATA_TYPE)); + cudaMemset(dA, 0, sizeA * sizeof(DATA_TYPE)); + + cudaMalloc(&dB, sizeB * sizeof(DATA_TYPE)); + cudaMemset(dB, 0, sizeB * sizeof(DATA_TYPE)); + + cudaMalloc(&dC, sizeC * sizeof(DATA_TYPE)); + cudaMemset(dC, 0, sizeC * sizeof(DATA_TYPE)); + + timer.onTimer(1); + + timer.onTimer(4); + cudaMemcpy(dA, A, sizeA * sizeof(DATA_TYPE), cudaMemcpyHostToDevice); + cudaMemcpy(dB, B, sizeB * sizeof(DATA_TYPE), cudaMemcpyHostToDevice); + timer.offTimer(4); + + /****************************************************************** + * Write your codes for GPU algorithm from here + ******************************************************************/ + // Sharead memroy version + + // 1. set the thread layout + // Change the layout if you need + dim3 gridDim(ceil((float)m / BLOCK_SIZE), ceil((float)n / BLOCK_SIZE)); + dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); + printf("Grid(%d, %d), Block(%d, %d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y); + + // 2. kernel call + timer.onTimer(3); + MatMul_SharedMem <<>> (dA, dB, dC, m, n, k); + cudaDeviceSynchronize(); + timer.offTimer(3); + + /****************************************************************** + ******************************************************************/ + + timer.onTimer(5); + cudaMemcpy(Cgpu, dC, sizeC * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost); + timer.offTimer(5); + + timer.offTimer(1); + + // Basci version + runMatMul_Basic(dA, dB, dC, m, n, k); + + cudaFree(dA); + cudaFree(dB); + cudaFree(dC); + + printf("[Kernel (shared memroy)] "); + compareMatrix(Ccpu, Cgpu, sizeC); + + timer.printTimer(1); + + delete A; + delete B; + delete Ccpu; + delete Cgpu; + + return 0; +} + +bool compareMatrix(DATA_TYPE* _A, DATA_TYPE* _B, int _size) +{ + bool isMatched = true; + for (int i = 0; i < _size; i++) { + if (!IS_EQUAL(_A[i], _B[i])) { + printf("[%d] not matched! (%f, %f)\n", i, _A[i], _B[i]); + getchar(); + isMatched = false; + } + } + if (isMatched) + printf("Results are matched!\n"); + else + printf("Results are not matched!!!!!!!!!!!\n"); + + return isMatched; +} + +__global__ void MatMul(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = blockDim.y * blockIdx.y + threadIdx.y; + + if (row >= m || col >= n) + return; + + DATA_TYPE val = 0; // hope to use register + for (int i = 0; i < k; i++) + val += matA[ID2INDEX(row, i, k)] * matB[ID2INDEX(i, col, n)]; + + matC[ID2INDEX(row, col, n)] = val; +} + +void runMatMul_Basic(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + dim3 gridDim(ceil((float)m / BLOCK_SIZE), ceil((float)n / BLOCK_SIZE)); + dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); + + timer.onTimer(7); + MatMul <<< gridDim, blockDim >>> (matA, matB, matC, m, n, k); + cudaDeviceSynchronize(); + timer.offTimer(7); + + cudaMemset(matC, 0, m * n * sizeof(DATA_TYPE)); +} + +template +void allocNinitMem(T** p, long long size, DATA_TYPE* memUsage) { + *p = new T[size]; + memset(*p, 0, sizeof(T) * size); + + if (memUsage != NULL) { + *memUsage += sizeof(T) * size; + } +} + +void setTimer() +{ + timer.setTimerName(0, (char*)"CPU algorithm"); + timer.setTimerName(1, (char*)"GPU/CUDA algorithm"); + timer.setTimerName(3, (char*)" - Kernel (Shared memory)"); + timer.setTimerName(4, (char*)" - [Data transter] host->device"); + timer.setTimerName(5, (char*)" - [Data transfer] device->host"); + timer.setTimerName(7, (char*)"Kernel (Basic)"); +} \ No newline at end of file diff --git a/CUDA_Programming.sln b/CUDA_Programming.sln index a1bcd19..d1d46ba 100644 --- a/CUDA_Programming.sln +++ b/CUDA_Programming.sln @@ -141,6 +141,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Chap10", "Chap10", "{E30E18 EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "10_1_MatMul_SharedMem_skeleton", "VS2019\Book_BJ\Chap10\10_1_MatMul_SharedMem_skeleton\10_1_MatMul_SharedMem_skeleton.vcxproj", "{96415E5F-E9D8-4A05-A2D0-AEDFC695F813}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "10_6_MatMul_SharedMem", "VS2019\Book_BJ\Chap10\10_6_MatMul_SharedMem\10_6_MatMul_SharedMem.vcxproj", "{5EFF6068-FB20-413D-922B-097B00823572}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -419,6 +421,12 @@ Global {96415E5F-E9D8-4A05-A2D0-AEDFC695F813}.Release|x64.ActiveCfg = Release|x64 {96415E5F-E9D8-4A05-A2D0-AEDFC695F813}.Release|x64.Build.0 = Release|x64 {96415E5F-E9D8-4A05-A2D0-AEDFC695F813}.Release|x86.ActiveCfg = Release|x64 + {5EFF6068-FB20-413D-922B-097B00823572}.Debug|x64.ActiveCfg = Debug|x64 + {5EFF6068-FB20-413D-922B-097B00823572}.Debug|x64.Build.0 = Debug|x64 + {5EFF6068-FB20-413D-922B-097B00823572}.Debug|x86.ActiveCfg = Debug|x64 + {5EFF6068-FB20-413D-922B-097B00823572}.Release|x64.ActiveCfg = Release|x64 + {5EFF6068-FB20-413D-922B-097B00823572}.Release|x64.Build.0 = Release|x64 + {5EFF6068-FB20-413D-922B-097B00823572}.Release|x86.ActiveCfg = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -486,6 +494,7 @@ Global {395AC7FD-9827-4811-AF26-7E1307FD028D} = {85D65ED0-1301-4A36-A391-938C22BC6F81} {E30E18B5-4C18-477A-A096-D9506E0563A3} = {5C5F8058-87FE-4819-9109-A071CD239ED0} {96415E5F-E9D8-4A05-A2D0-AEDFC695F813} = {E30E18B5-4C18-477A-A096-D9506E0563A3} + {5EFF6068-FB20-413D-922B-097B00823572} = {E30E18B5-4C18-477A-A096-D9506E0563A3} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {52DE253B-6715-4A93-BA27-7131973681F9} diff --git a/Intermediate_Class/Lecture4/MatMul_sharedMem/MatMul_sharedMem.cu b/Intermediate_Class/Lecture4/MatMul_sharedMem/MatMul_sharedMem.cu index e2d1530..5134905 100644 --- a/Intermediate_Class/Lecture4/MatMul_sharedMem/MatMul_sharedMem.cu +++ b/Intermediate_Class/Lecture4/MatMul_sharedMem/MatMul_sharedMem.cu @@ -37,18 +37,18 @@ __global__ void MatMul_SharedMem(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* ma int localCol = threadIdx.y; for (int bID = 0; bID < ceil((float)k / BLOCK_SIZE); bID++) { - int offset = bID * BLOCK_SIZE; + int stride = bID * BLOCK_SIZE; // load A and B - if (row >= m || offset + localCol >= k) + if (row >= m || stride + localCol >= k) subA[localRow][localCol] = 0; else - subA[localRow][localCol] = matA[row * k + (offset + localCol)]; + subA[localRow][localCol] = matA[row * k + (stride + localCol)]; - if (col >= n || offset + localRow >= k) + if (col >= n || stride + localRow >= k) subB[localRow][localCol] = 0; else - subB[localRow][localCol] = matB[(offset + localRow) * n + col]; + subB[localRow][localCol] = matB[(stride + localRow) * n + col]; __syncthreads(); diff --git a/VS2019/Book_BJ/Chap10/10_6_MatMul_SharedMem/10_6_MatMul_SharedMem.vcxproj b/VS2019/Book_BJ/Chap10/10_6_MatMul_SharedMem/10_6_MatMul_SharedMem.vcxproj new file mode 100644 index 0000000..39325da --- /dev/null +++ b/VS2019/Book_BJ/Chap10/10_6_MatMul_SharedMem/10_6_MatMul_SharedMem.vcxproj @@ -0,0 +1,93 @@ + + + + + Debug + x64 + + + Release + x64 + + + + {5EFF6068-FB20-413D-922B-097B00823572} + _10_6_MatMul_SharedMem + + + + Application + true + MultiByte + v142 + + + Application + false + true + MultiByte + v142 + + + + + + + + + + + + + + true + + + + Level3 + Disabled + WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + 64 + + + + + Level3 + MaxSpeed + true + true + WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + true + true + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + 64 + + + + + + + + + + + + + + + + + \ No newline at end of file