From 78595afa7bfb5175c93152edfe87d5fc92586e39 Mon Sep 17 00:00:00 2001 From: Duksu Kim Date: Tue, 12 Oct 2021 20:39:44 +0900 Subject: [PATCH] Update lec5 --- CUDA_Programming.sln | 13 +- .../MatMul_MemAccessPattern.cu | 11 +- .../MatMul_back_conflict/DS_definitions.h | 197 ++++++++++ .../MatMul_back_conflict/DS_timer.cpp | 336 ++++++++++++++++++ .../Lecture5/MatMul_back_conflict/DS_timer.h | 95 +++++ .../MatMul_back_conflict.cu | 256 +++++++++++++ .../MatMul_bank_conflict.vcxproj | 158 ++++++++ .../MatMul_bank_conflict.vcxproj.filters | 13 + 8 files changed, 1073 insertions(+), 6 deletions(-) create mode 100644 Intermediate_Class/Lecture5/MatMul_back_conflict/DS_definitions.h create mode 100644 Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.cpp create mode 100644 Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.h create mode 100644 Intermediate_Class/Lecture5/MatMul_back_conflict/MatMul_back_conflict.cu create mode 100644 VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj create mode 100644 VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj.filters diff --git a/CUDA_Programming.sln b/CUDA_Programming.sln index 312be14..b0779a6 100644 --- a/CUDA_Programming.sln +++ b/CUDA_Programming.sln @@ -49,6 +49,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Lecture5", "Lecture5", "{EF EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatMul_MemAccessPattern", "VS2019\Intermediate_Class\Lecture5\MatMul_MemAccessPattern\MatMul_MemAccessPattern.vcxproj", "{78E9C712-3A19-4402-A9F6-50A621258682}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatMul_bank_conflict", "VS2019\Intermediate_Class\Lecture5\MatMul_bank_conflict\MatMul_bank_conflict.vcxproj", "{8D608736-D0A2-4D23-BED7-641B87D5F4C1}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -139,6 +141,14 @@ Global {78E9C712-3A19-4402-A9F6-50A621258682}.Release|x64.Build.0 = Release|x64 {78E9C712-3A19-4402-A9F6-50A621258682}.Release|x86.ActiveCfg = Release|Win32 {78E9C712-3A19-4402-A9F6-50A621258682}.Release|x86.Build.0 = Release|Win32 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Debug|x64.ActiveCfg = Debug|x64 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Debug|x64.Build.0 = Debug|x64 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Debug|x86.ActiveCfg = Debug|Win32 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Debug|x86.Build.0 = Debug|Win32 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Release|x64.ActiveCfg = Release|x64 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Release|x64.Build.0 = Release|x64 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Release|x86.ActiveCfg = Release|Win32 + {8D608736-D0A2-4D23-BED7-641B87D5F4C1}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -154,7 +164,7 @@ Global {F7C0847C-FB6E-4EEA-9F41-6496E6A9D1D1} = {694AE413-7EF6-4E09-A6FB-F0DB09C6C958} {847E94AE-AF96-4A3B-A751-FC11A7D5E0B9} = {694AE413-7EF6-4E09-A6FB-F0DB09C6C958} {CE6F28BA-60F6-4E0E-ACA0-D6D96E0B0351} = {BBC3837C-5486-4B81-92F4-2ADAE8696667} - {02359BC0-0118-4B16-B441-36814F5E71BB} = {BBC3837C-5486-4B81-92F4-2ADAE8696667} + {02359BC0-0118-4B16-B441-36814F5E71BB} = {94C2BE3E-F051-43B9-9C46-C56D1D2D5B0D} {94C2BE3E-F051-43B9-9C46-C56D1D2D5B0D} = {BBC3837C-5486-4B81-92F4-2ADAE8696667} {9D888272-7A7A-4696-92CF-4AAD7B154C99} = {CE6F28BA-60F6-4E0E-ACA0-D6D96E0B0351} {3EFA4D0E-3D17-4785-AC1A-DA9CE3CD7E2B} = {CE6F28BA-60F6-4E0E-ACA0-D6D96E0B0351} @@ -165,6 +175,7 @@ Global {22D65404-3842-48CD-BCA3-FABACECE205F} = {FF3FE1FF-AC94-4E2C-9C52-FD2318512EE2} {EF9A42B1-29F7-4349-827D-08B3B5D95ECA} = {B3430C43-00B8-443C-A7C2-DCF639AEADD1} {78E9C712-3A19-4402-A9F6-50A621258682} = {EF9A42B1-29F7-4349-827D-08B3B5D95ECA} + {8D608736-D0A2-4D23-BED7-641B87D5F4C1} = {EF9A42B1-29F7-4349-827D-08B3B5D95ECA} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {52DE253B-6715-4A93-BA27-7131973681F9} diff --git a/Intermediate_Class/Lecture5/MatMul_MemAccessPattern/MatMul_MemAccessPattern.cu b/Intermediate_Class/Lecture5/MatMul_MemAccessPattern/MatMul_MemAccessPattern.cu index f8b1ca0..262f06c 100644 --- a/Intermediate_Class/Lecture5/MatMul_MemAccessPattern/MatMul_MemAccessPattern.cu +++ b/Intermediate_Class/Lecture5/MatMul_MemAccessPattern/MatMul_MemAccessPattern.cu @@ -92,18 +92,19 @@ int main(int argc, char* argv[]) cudaMemcpy(dA, A, sizeA * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(dB, B, sizeB * sizeof(int), cudaMemcpyHostToDevice); - dim3 gridDim(ceil((float)m / BLOCK_SIZE), ceil((float)n / BLOCK_SIZE)); - dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); - // Row = X-dim version timer.onTimer(1); - MatMul_xRow<<>> (dA, dB, dC, m, n, k); + dim3 gridDim_xRow(ceil((float)m / BLOCK_SIZE), ceil((float)n / BLOCK_SIZE)); + dim3 blockDim_xRow(BLOCK_SIZE, BLOCK_SIZE); + MatMul_xRow<<>> (dA, dB, dC, m, n, k); cudaDeviceSynchronize(); timer.offTimer(1); // Row = Y-dim version timer.onTimer(2); - MatMul_yRow<< > > (dA, dB, dC, m, n, k); + dim3 gridDim_yRow(ceil((float)n / BLOCK_SIZE), ceil((float)m / BLOCK_SIZE)); + dim3 blockDim_yRow(BLOCK_SIZE, BLOCK_SIZE); + MatMul_yRow<<>> (dA, dB, dC, m, n, k); cudaDeviceSynchronize(); timer.offTimer(2); diff --git a/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_definitions.h b/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_definitions.h new file mode 100644 index 0000000..e7a6816 --- /dev/null +++ b/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_definitions.h @@ -0,0 +1,197 @@ +#pragma once + +#include +#include +#include +#include + +#define OS_WINDOWS 0 +#define OS_LINUX 1 + +#ifdef _WIN32 +#define _TARGET_OS OS_WINDOWS +#else + #ifndef nullptr + #define nullptr NULL + #endif + #define _TARGET_OS OS_LINUX +#endif + +/************************************************************************/ +/* OS dependet function */ +/************************************************************************/ +#if _TARGET_OS == OS_WINDOWS +// #define _SPRINT sprintf_s +#define _STRTOK strtok_s + +#define EXIT_WIHT_KEYPRESS {std::cout << "Press any key to exit..."; getchar(); exit(0);} + +#define SPLIT_PATH(_path,_result) \ + _splitpath_s(_path, _result.drive, 255, _result.dir, 255, _result.filename, 255, _result.ext, 255) + + +#elif _TARGET_OS == OS_LINUX +#include +#include + +#define _STRTOK strtok_r + +#define EXIT_WIHT_KEYPRESS {std::cout << "Program was terminated!"; exit(0);} + +#define sprintf_s sprintf +#define scanf_s scanf +#define fprintf_s fprintf + +#define __int64 int64_t + +#define fopen_s(fp, name, mode) (*fp = fopen(name, mode)) + +#endif + +/************************************************************************/ +/* Defines */ +/************************************************************************/ + +// *********** data size +#define _1K_ 1024 +#define _1M_ (_1K_*_1K_) +#define _1G_ (_1M_*_1K_) + +#define CHAR_STRING_SIZE 255 + +/************************************************************************/ +/* Type definitions */ +/************************************************************************/ +typedef unsigned int UINT ; + +/************************************************************************/ +/* Macro functions */ +/************************************************************************/ +#define DS_MEM_DELETE(a) \ + if (a != NULL) { \ + delete a ; \ + a = NULL ; \ + } + +#define DS_MEM_DELETE_ARRAY(a) \ + if (a != NULL) { \ + delete [] a ; \ + a = NULL ; \ + } + +#define RANGE_MIN 0 +#define RANGE_MAX 1 + +#define MATCHED_STRING 0 + +#ifndef VTK_RANGE_MIN +#define VTK_RANGE_MIN 0 +#define VTK_RANGE_MAX 1 +#endif + +// Print +#define PRINT_LINE_INFO printf("%s, line %d", __FILE__, __LINE__) +#define PRINT_ERROR_MSG(_msg) {PRINT_LINE_INFO; printf(" at "); printf(_msg);} + +// Single loops +#define LOOP_I(a) for(int i=0; i +void SWAP(T &a, T &b){ + T tmp = a; + a = b; + b = tmp; +} +#endif + +// +#ifndef MIN +#define MIN(a,b) (a > b ? b : a) +#endif + +#ifndef MAX +#define MAX(a,b) (a > b ? a : b) +#endif + +// Index converter + +#define INDEX2X(_ID,_W) (_ID%_W) +#define INDEX2Y(_ID,_W) (_ID/_W) +#define INDEX2ID(_ID,_X,_Y,_W) {_X=INDEX2X(_ID,_W);_Y=INDEX2Y(_ID_,_W);} +#define ID2INDEX(_W,_X,_Y) (_Y*_W+_X) +#define PTR2ID(_type, _target, _base) ((_type*)_target - (_type*)_base) + +// Memory allocation and release +#ifndef SAFE_DELETE +#define SAFE_DELETE(p) {if(p!=NULL) delete p; p=NULL;} +#endif + +#ifndef SAFE_DELETE_ARR +#define SAFE_DELETE_ARR(p) {if(p!=NULL) delete [] p; p=NULL;} +#endif + +#define SAFE_NEW(p, type, size) {\ + try {p = new type[size];} \ + catch(std::bad_alloc& exc) \ + { printf("[%s, line %d] fail to memory allocation - %.2f MB requested\n", __FILE__, __LINE__, (float)(sizeof(type)*size)/_1M_); \ + EXIT_WIHT_KEYPRESS }\ + } + +template +void memsetZero(T** p, long long size = 0) { + if (*p != NULL) + memset(*p, 0, sizeof(T)*size); +} + +template +void allocNinitMem(T** p, long long size, double *memUsage = NULL) { + *p = new T[size]; + //SAFE_NEW(*p, T, size); + memset(*p, 0, sizeof(T)*size); + + if (memUsage != NULL) { + *memUsage += sizeof(T)*size; + } +} + +#define SAFE_MEMCPY(_dst, _src, _type, _size){ \ + if(_dst == nullptr || _src == nullptr ) \ + printf("[%s, line %d] fail to memcpy (dst = %x, src = %x)\n", __FILE__, __LINE__, _dst, _src); \ + exit(-1); \ + memcpy(_dst, _src, sizeof(_type)*_size);\ +} + +// VTK related +#ifndef SAFE_DELETE_VTK +#define SAFE_DELETE_VTK(p) {if(p!=NULL) p->Delete(); p=NULL;} +#endif + +#ifndef VTK_IS_NOERROR +//#include "DS_common_def.h" +#define VTK_IS_NOERROR(p) (p->GetErrorCode()==vtkErrorCode::NoError ? true : false) +#endif + +/************************************************************************/ +/* Data structures */ +/************************************************************************/ +typedef struct { + std::string input; + std::string output; +} nameMatch; + +typedef struct { + char drive[255]; + char dir[255]; + char filename[255]; + char ext[255]; +} filePathSplit; diff --git a/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.cpp b/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.cpp new file mode 100644 index 0000000..876a2c0 --- /dev/null +++ b/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.cpp @@ -0,0 +1,336 @@ +#include "DS_timer.h" +#include "DS_definitions.h" +#include + +//#ifndef _WIN32 +//#define fprintf_s fprintf +//void fopen_s(FILE** fp, char* name, char* mode) +//{ +// *fp = fopen(name, mode); +//} +//#endif + +/************************************************************************/ +/* Constructor & Destructor */ +/************************************************************************/ +DS_timer::DS_timer(int _numTimer /* =0 */, int _numCount /* =0 */, bool _trunOn /* =true */){ + + turnOn = _trunOn ; + start_ticks = NULL ; end_ticks = NULL ; totalTicks = NULL ; + counters = NULL ; + timerStates = NULL ; + timerName = NULL ; + + numTimer = numCounter = 0 ; + + setTimerTitle((char*)"DS_timer Report") ; + + setTimer(_numTimer ) ; + setCounter( _numTimer ) ; + +#ifdef _WIN32 + // For windows + QueryPerformanceFrequency(&ticksPerSecond) ; +#endif +} + +DS_timer::~DS_timer(void){ + releaseTimers() ; + releaseCounters() ; +} + +/************************************************************************/ +/* Set & Get configurations */ +/************************************************************************/ + +// Memory allocation +void DS_timer::memAllocCounters( void ) { + releaseCounters() ; + counters = new UINT[numCounter] ; + initCounters() ; +} + +void DS_timer::memAllocTimers( void ) { + releaseTimers() ; + + timerStates = new bool[numTimer] ; + start_ticks = new TIME_VAL[numTimer]; + end_ticks = new TIME_VAL[numTimer]; + totalTicks = new TIME_VAL[numTimer]; + + // Initialize + memset(timerStates, 0, sizeof(bool)*numTimer); + memset(start_ticks, 0, sizeof(TIME_VAL)*numTimer); + memset(end_ticks, 0, sizeof(TIME_VAL)*numTimer); + memset(totalTicks, 0, sizeof(TIME_VAL)*numTimer); + + timerName = new std::string[numTimer] ; + for ( UINT i = 0 ; i < numTimer ; i++ ) { + timerName[i].clear() ; + timerName[i].resize(255) ; + } +} + +// Memory release +void DS_timer::releaseCounters( void ) { DS_MEM_DELETE_ARRAY(counters) ; } +void DS_timer::releaseTimers( void ) { + //DS_MEM_DELETE(start_ticks) ; DS_MEM_DELETE(end_ticks) ; DS_MEM_DELETE(totalTicks) ; + DS_MEM_DELETE_ARRAY(timerStates) ; + DS_MEM_DELETE_ARRAY(start_ticks) ; + DS_MEM_DELETE_ARRAY(end_ticks) ; + DS_MEM_DELETE_ARRAY(totalTicks) ; + DS_MEM_DELETE_ARRAY(timerName) ; +} + +// Getters +UINT DS_timer::getNumTimer( void ) { return numTimer ; } +UINT DS_timer::getNumCounter( void ) { return numCounter ; } + +// Setters +UINT DS_timer::setTimer( UINT _numTimer ) { + if ( _numTimer == 0 ) + return 0 ; + + if (_numTimer <= numTimer) + return numTimer; + + if ( numTimer != 0 ) { + + // Backup + UINT oldNumTimer = numTimer ; + TIME_VAL *oldTotalTicks = new TIME_VAL[oldNumTimer]; + memcpy(oldTotalTicks, totalTicks, sizeof(TIME_VAL)*oldNumTimer); + + numTimer = _numTimer ; + memAllocTimers() ; + + memcpy(totalTicks, oldTotalTicks, sizeof(TIME_VAL)* oldNumTimer); + delete oldTotalTicks ; + } else { + numTimer = _numTimer ; + memAllocTimers() ; + } + + return _numTimer ; +} + +UINT DS_timer::setCounter( UINT _numCounter ) { + + if (_numCounter == 0 ) + return 0 ; + + if (_numCounter <= numCounter) + return numCounter; + + if ( numCounter != 0 ) { + + // Backup + int numOldCounter = numCounter ; + UINT *oldCounters = new UINT[numOldCounter] ; + memcpy(oldCounters, counters, sizeof(UINT)*numOldCounter) ; + + numCounter = _numCounter ; + memAllocCounters() ; + + // Restore + memcpy(counters, oldCounters, sizeof(UINT)*numOldCounter) ; + delete oldCounters ; + + } else { + numCounter = _numCounter ; + memAllocCounters() ; + } + + return numCounter ; + +} + +/************************************************************************/ +/* Timer */ +/************************************************************************/ +void DS_timer::initTimer( UINT id ) { + timerStates[id] = TIMER_OFF ; +#ifdef _WIN32 + totalTicks[id].QuadPart = 0 ; +#else + totalTicks[id].tv_sec = 0; + totalTicks[id].tv_usec = 0; +#endif +} + +void DS_timer::initTimers( void ) { + for ( UINT i = 0 ; i < numTimer ; i++ ) { + initTimer(i); + } +} + +void DS_timer::onTimer( UINT id ) { + if ( turnOn == false ) + return ; + + if ( timerStates[id] == TIMER_ON ) + return ; +#ifdef _WIN32 + QueryPerformanceCounter(&start_ticks[id]) ; +#else + gettimeofday(&start_ticks[id], NULL); +#endif + + timerStates[id] = TIMER_ON ; +} + +void DS_timer::offTimer( UINT id ) { + if ( turnOn == false ) + return ; + + if ( timerStates[id] == TIMER_OFF ) + return ; + +#ifdef _WIN32 + QueryPerformanceCounter(&end_ticks[id]) ; + totalTicks[id].QuadPart = totalTicks[id].QuadPart + (end_ticks[id].QuadPart - start_ticks[id].QuadPart) ; +#else + gettimeofday(&end_ticks[id], NULL); + TIME_VAL period, previous; + timersub(&end_ticks[id], &start_ticks[id], &period); + previous = totalTicks[id]; + timeradd(&previous, &period, &totalTicks[id]); +#endif + + timerStates[id] = TIMER_OFF ; +} + +double DS_timer::getTimer_ms( UINT id ) { +#ifdef _WIN32 + return ((double)totalTicks[id].QuadPart/(double)ticksPerSecond.QuadPart * 1000) ; +#else + return (double)(totalTicks[id].tv_sec * 1000 + totalTicks[id].tv_usec / 1000.0); +#endif +} + +/************************************************************************/ +/* Counter */ +/************************************************************************/ +void DS_timer::incCounter( UINT id ) { + if ( turnOn == false ) + return ; + + counters[id]++ ; +} + +void DS_timer::initCounters( void ) { + if ( turnOn == false ) + return ; + + for ( UINT i = 0 ; i < numCounter ; i++ ) + counters[i] = 0 ; +} + +void DS_timer::initCounter( UINT id ) { + if ( turnOn == false ) + return ; + + counters[id] = 0 ; +} + +void DS_timer::add2Counter( UINT id, UINT num ) { + if ( turnOn == false ) + return ; + + counters[id] += num ; +} + +UINT DS_timer::getCounter( UINT id ) { + if ( turnOn == false ) + return 0; + + return counters[id] ; +} + +/************************************************************************/ +/* */ +/************************************************************************/ +void DS_timer::printTimer( float _denominator){ + + if ( turnOn == false ) + return ; + + //printf("\n*\t DS_timer Report \t*\n") ; + printf("\n*\t %s \t*\n", timerTitle) ; + printf("* The number of timer = %d, counter = %d\n", numTimer, numCounter ) ; + printf("**** Timer report ****\n") ; + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( getTimer_ms(i) == 0 ) + continue ; + if ( timerName[i].c_str()[0] == 0 ) + printf("Timer %d : %.5f ms\n", i, getTimer_ms(i) ) ; + else + printf("%s : %.5f ms\n", timerName[i].c_str(), getTimer_ms(i) ) ; + } + + printf("**** Counter report ****\n") ; + for ( UINT i = 0 ; i < numCounter ;i++ ) { + if ( counters[i] == 0 ) + continue ; + printf("Counter %d : %.3f (%d) \n",i, counters[i]/_denominator, counters[i] ) ; + } + + printf("*\t End of the report \t*\n") ; +} + +void DS_timer::printToFile( char* fileName, int _id ) +{ + if ( turnOn == false ) + return ; + + FILE *fp ; + + if ( fileName == NULL) + fopen_s(&fp, "DS_timer_report.txt", "a") ; + else { + fopen_s(&fp, fileName, "a") ; + } + + if ( _id >= 0 ) + fprintf_s(fp,"%d\t", _id) ; + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( getTimer_ms(i) == 0 ) + continue ; + fprintf_s(fp, "%s: %.9f\n", timerName[i].c_str(), getTimer_ms(i) ) ; + } + + for ( UINT i = 0 ; i < numCounter ;i++ ) { + if ( counters[i] == 0 ) + continue ; + fprintf_s(fp, "%s: %d\n", timerName[i].c_str(), counters[i] ) ; + } + + fprintf_s(fp, "\n") ; + + fclose(fp) ; +} + +void DS_timer::printTimerNameToFile( char* fileName ) +{ + if ( turnOn == false ) + return ; + + FILE *fp ; + + if ( fileName == NULL) + fopen_s(&fp, "DS_timer_name.txt", "a") ; + else { + fopen_s(&fp, fileName, "a") ; + } + + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( timerName[i].empty() ) + continue ; + fprintf_s(fp, "%s\t", timerName[i].c_str() ) ; + } + fprintf_s(fp, "\n") ; + fclose(fp) ; +} diff --git a/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.h b/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.h new file mode 100644 index 0000000..d1c3391 --- /dev/null +++ b/Intermediate_Class/Lecture5/MatMul_back_conflict/DS_timer.h @@ -0,0 +1,95 @@ +//#pragma once +#ifndef _DS_TIMER_H +#define _DS_TIMER_H + +#include // std string + +#ifndef UINT +typedef unsigned int UINT; +#endif + +#ifdef _WIN32 + // For windows + #include + typedef LARGE_INTEGER TIME_VAL; +#else + // For Unix/Linux + #include + #include + #include + #include // c string + typedef struct timeval TIME_VAL; +#endif + +#define TIMER_ON true +#define TIMER_OFF false + +class DS_timer +{ +private : + + bool turnOn ; + + UINT numTimer ; + UINT numCounter ; + + // For timers + bool* timerStates ; + TIME_VAL ticksPerSecond; + TIME_VAL *start_ticks; + TIME_VAL *end_ticks; + TIME_VAL *totalTicks; + + char timerTitle[255] ; + std::string *timerName ; + + // For counters + UINT *counters ; + + void memAllocCounters ( void ) ; + void memAllocTimers ( void ) ; + void releaseCounters ( void ) ; + void releaseTimers ( void ) ; + +public: + DS_timer(int _numTimer = 1, int _numCount = 1, bool _trunOn = true ); + ~DS_timer(void); + + // For configurations + inline void timerOn ( void ) { turnOn = TIMER_ON ; } + inline void timerOff ( void ) { turnOn = TIMER_OFF ; } + + UINT getNumTimer( void ) ; + UINT getNumCounter ( void ) ; + UINT setTimer ( UINT _numTimer ) ; + UINT setCounter ( UINT _numCounter ) ; + + // For timers + + void initTimer(UINT id) ; + void initTimers ( void ); + void onTimer(UINT id) ; + void offTimer(UINT id ) ; + double getTimer_ms(UINT id) ; + + void setTimerTitle ( char* _name ) { memset(timerTitle, 0, sizeof(char)*255) ; memcpy(timerTitle, _name, strlen(_name)) ; } + + void setTimerName (UINT id, std::string &_name) { timerName[id] = _name ; } + void setTimerName (UINT id, char* _name) { timerName[id] = _name ;} + + // For counters + + void incCounter(UINT id) ; + void initCounters( void ) ; + void initCounter(UINT id) ; + void add2Counter( UINT id, UINT num ) ; + UINT getCounter ( UINT id ) ; + + // For reports + + void printTimer ( float _denominator = 1 ) ; + void printToFile ( char* fileName, int _id = -1 ) ; + void printTimerNameToFile ( char* fileName ) ; +} ; + +#endif \ No newline at end of file diff --git a/Intermediate_Class/Lecture5/MatMul_back_conflict/MatMul_back_conflict.cu b/Intermediate_Class/Lecture5/MatMul_back_conflict/MatMul_back_conflict.cu new file mode 100644 index 0000000..28c29d7 --- /dev/null +++ b/Intermediate_Class/Lecture5/MatMul_back_conflict/MatMul_back_conflict.cu @@ -0,0 +1,256 @@ +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include "DS_timer.h" +#include +#include +#include + +#define DATA_TYPE int + +#define SIZE_M (512*2) +#define SIZE_N (512*4) +#define SIZE_K (512*2) + +#define INDEX2ROW(_index,_width) (int)((_index)/(_width)) +#define INDEX2COL(_index,_width) ((_index)%(_width)) +#define ID2INDEX(_row,_col, _width) (((_row)*(_width))+(_col)) + +#define BLOCK_SIZE 16 + +// macro function +#define IS_EQUAL(_a, _b) (abs(_b - _a) < 10e-6) + +// Kernel - shared memory +__global__ void MatMul_SharedMem(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = blockDim.y * blockIdx.y + threadIdx.y; + + DATA_TYPE val = 0; + __shared__ DATA_TYPE subA[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ DATA_TYPE subB[BLOCK_SIZE][BLOCK_SIZE]; + + int localRow = threadIdx.x; + int localCol = threadIdx.y; + + for (int bID = 0; bID < ceil((float)k / BLOCK_SIZE); bID++) { + int offset = bID * BLOCK_SIZE; + + // load A and B + if (row >= m || offset + localCol >= k) + subA[localRow][localCol] = 0; + else + subA[localRow][localCol] = matA[row * k + (offset + localCol)]; + + if (col >= n || offset + localRow >= k) + subB[localRow][localCol] = 0; + else + subB[localRow][localCol] = matB[(offset + localRow) * n + col]; + + __syncthreads(); + + // compute + for (int i = 0; i < BLOCK_SIZE; i++) { + val += subA[localRow][i] * subB[i][localCol]; + } + __syncthreads(); + } + + if (row >= m || col >= n) + return; + + matC[row * n + col] = val; +} + +// Kernel - shared memory & avoid bank conflict +__global__ void MatMul_NoBankConflict(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = blockDim.y * blockIdx.y + threadIdx.y; + + DATA_TYPE val = 0; + __shared__ DATA_TYPE subA[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ DATA_TYPE subB[BLOCK_SIZE][BLOCK_SIZE]; + + int localRow = threadIdx.x; + int localCol = threadIdx.y; + + for (int bID = 0; bID < ceil((float)k / BLOCK_SIZE); bID++) { + int offset = bID * BLOCK_SIZE; + + // load A and B + if (row >= m || offset + localCol >= k) + subA[localCol][localRow] = 0; + else + subA[localCol][localRow] = matA[row * k + (offset + localCol)]; + + if (col >= n || offset + localRow >= k) + subB[localRow][localCol] = 0; + else + subB[localRow][localCol] = matB[(offset + localRow) * n + col]; + + __syncthreads(); + + // compute + for (int i = 0; i < BLOCK_SIZE; i++) { + val += subA[i][localRow] * subB[i][localCol]; + } + __syncthreads(); + } + + if (row >= m || col >= n) + return; + + matC[row * n + col] = val; +} + +template void allocNinitMem(T** p, long long size, DATA_TYPE* memUsage = NULL); +void runMatMul_Basic(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k); +bool compareMatrix(DATA_TYPE* _A, DATA_TYPE* _B, int _size); + +DS_timer timer(10); +void setTimer(); + +int main(int argc, char* argv[]) +{ + setTimer(); + + // set matrix size + int m, n, k; + m = SIZE_M; + n = SIZE_N; + k = SIZE_K; + + printf("Size : A = (%d by %d), B = (%d by %d), C = (%d by %d)\n", m, k, k, n, m, n); + + int sizeA = m * k; + int sizeB = k * n; + int sizeC = m * n; + + // Make matrix + DATA_TYPE* A = NULL, * B = NULL; + allocNinitMem(&A, sizeA); + allocNinitMem(&B, sizeB); + + DATA_TYPE* Ccpu = NULL, * Cgpu = NULL; + allocNinitMem(&Ccpu, sizeC); + allocNinitMem(&Cgpu, sizeC); + + // generate input matrices + for (int i = 0; i < sizeA; i++) A[i] = ((rand() % 10) + ((rand() % 100) / 100.0)); + for (int i = 0; i < sizeB; i++) B[i] = ((rand() % 10) + ((rand() % 100) / 100.0)); + + // GPU setup + DATA_TYPE* dA, * dB, * dC; + + cudaMalloc(&dA, sizeA * sizeof(DATA_TYPE)); + cudaMemset(dA, 0, sizeA * sizeof(DATA_TYPE)); + + cudaMalloc(&dB, sizeB * sizeof(DATA_TYPE)); + cudaMemset(dB, 0, sizeB * sizeof(DATA_TYPE)); + + cudaMalloc(&dC, sizeC * sizeof(DATA_TYPE)); + cudaMemset(dC, 0, sizeC * sizeof(DATA_TYPE)); + + cudaMemcpy(dA, A, sizeA * sizeof(DATA_TYPE), cudaMemcpyHostToDevice); + cudaMemcpy(dB, B, sizeB * sizeof(DATA_TYPE), cudaMemcpyHostToDevice); + + dim3 gridDim(ceil((float)m / BLOCK_SIZE), ceil((float)n / BLOCK_SIZE)); + dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); + printf("Grid(%d, %d), Block(%d, %d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y); + + // No shared memory version + timer.onTimer(0); + runMatMul_Basic(dA, dB, dC, m, n, k); + timer.offTimer(0); + + // Bank conflict version + timer.onTimer(1); + MatMul_SharedMem <<>> (dA, dB, dC, m, n, k); + cudaDeviceSynchronize(); + timer.offTimer(1); + + // No bank conflict version + timer.onTimer(2); + MatMul_NoBankConflict <<>> (dA, dB, dC, m, n, k); + cudaDeviceSynchronize(); + timer.offTimer(2); + + + cudaMemcpy(Cgpu, dC, sizeC * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost); + + cudaFree(dA); + cudaFree(dB); + cudaFree(dC); + + timer.printTimer(1); + + delete A; + delete B; + delete Ccpu; + delete Cgpu; + + return 0; +} + +bool compareMatrix(DATA_TYPE* _A, DATA_TYPE* _B, int _size) +{ + bool isMatched = true; + for (int i = 0; i < _size; i++) { + if (!IS_EQUAL(_A[i], _B[i])) { + printf("[%d] not matched! (%f, %f)\n", i, _A[i], _B[i]); + getchar(); + isMatched = false; + } + } + if (isMatched) + printf("Results are matched!\n"); + else + printf("Results are not matched!!!!!!!!!!!\n"); + + return isMatched; +} + +__global__ void MatMul(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = blockDim.y * blockIdx.y + threadIdx.y; + + if (row >= m || col >= n) + return; + + DATA_TYPE val = 0; // hope to use register + for (int i = 0; i < k; i++) + val += matA[ID2INDEX(row, i, k)] * matB[ID2INDEX(i, col, n)]; + + matC[ID2INDEX(row, col, n)] = val; +} + +void runMatMul_Basic(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k) +{ + dim3 gridDim(ceil((float)m / BLOCK_SIZE), ceil((float)n / BLOCK_SIZE)); + dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); + + MatMul <<< gridDim, blockDim >>> (matA, matB, matC, m, n, k); + cudaDeviceSynchronize(); + + cudaMemset(matC, 0, m * n * sizeof(DATA_TYPE)); +} + +template +void allocNinitMem(T** p, long long size, DATA_TYPE* memUsage) { + *p = new T[size]; + memset(*p, 0, sizeof(T) * size); + + if (memUsage != NULL) { + *memUsage += sizeof(T) * size; + } +} + +void setTimer() +{ + timer.setTimerName(0, (char*)"Kernel (basic)"); + timer.setTimerName(1, (char*)"Kernel (shared memory with bank conflict)"); + timer.setTimerName(2, (char*)"Kernel (shared memory avoiding bank conflict)"); +} \ No newline at end of file diff --git a/VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj b/VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj new file mode 100644 index 0000000..c6d44ea --- /dev/null +++ b/VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj @@ -0,0 +1,158 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {8d608736-d0a2-4d23-bed7-641b87d5f4c1} + MatMulbankconflict + 10.0 + + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + true + + + false + + + true + + + false + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + + + + + + Document + + + + + + + \ No newline at end of file diff --git a/VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj.filters b/VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj.filters new file mode 100644 index 0000000..3158a4b --- /dev/null +++ b/VS2019/Intermediate_Class/Lecture5/MatMul_bank_conflict/MatMul_bank_conflict.vcxproj.filters @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file