diff --git a/CUDA_Programming.sln b/CUDA_Programming.sln index e4959ab..7abe1cd 100644 --- a/CUDA_Programming.sln +++ b/CUDA_Programming.sln @@ -55,6 +55,12 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Lecture6", "Lecture6", "{3D EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ThreadCounting_AtomicOp", "VS2019\Intermediate_Class\Lecture6\ThreadCounting_AtomicOp\ThreadCounting_AtomicOp.vcxproj", "{E36C456D-0161-426E-B305-E119C40B480F}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Lecture7", "Lecture7", "{8C2E78EF-0AF9-4AD6-B1E3-2A3BB7DB2798}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Stream", "VS2019\Intermediate_Class\Lecture7\Stream\Stream.vcxproj", "{AFD873F0-6742-40CC-A371-FFB188BC1AC1}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Lecture8", "Lecture8", "{40E3E314-3859-4C01-BCD7-331540F33D08}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -161,6 +167,14 @@ Global {E36C456D-0161-426E-B305-E119C40B480F}.Release|x64.Build.0 = Release|x64 {E36C456D-0161-426E-B305-E119C40B480F}.Release|x86.ActiveCfg = Release|Win32 {E36C456D-0161-426E-B305-E119C40B480F}.Release|x86.Build.0 = Release|Win32 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Debug|x64.ActiveCfg = Debug|x64 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Debug|x64.Build.0 = Debug|x64 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Debug|x86.ActiveCfg = Debug|Win32 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Debug|x86.Build.0 = Debug|Win32 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x64.ActiveCfg = Release|x64 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x64.Build.0 = Release|x64 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x86.ActiveCfg = Release|Win32 + {AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -190,6 +204,9 @@ Global {8D608736-D0A2-4D23-BED7-641B87D5F4C1} = {EF9A42B1-29F7-4349-827D-08B3B5D95ECA} {3DEBB45B-FF8B-4776-BFF5-D68A48058E79} = {B3430C43-00B8-443C-A7C2-DCF639AEADD1} {E36C456D-0161-426E-B305-E119C40B480F} = {3DEBB45B-FF8B-4776-BFF5-D68A48058E79} + {8C2E78EF-0AF9-4AD6-B1E3-2A3BB7DB2798} = {B3430C43-00B8-443C-A7C2-DCF639AEADD1} + {AFD873F0-6742-40CC-A371-FFB188BC1AC1} = {8C2E78EF-0AF9-4AD6-B1E3-2A3BB7DB2798} + {40E3E314-3859-4C01-BCD7-331540F33D08} = {B3430C43-00B8-443C-A7C2-DCF639AEADD1} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {52DE253B-6715-4A93-BA27-7131973681F9} diff --git a/Intermediate_Class/Lecture7/Stream/DS_definitions.h b/Intermediate_Class/Lecture7/Stream/DS_definitions.h new file mode 100644 index 0000000..e7a6816 --- /dev/null +++ b/Intermediate_Class/Lecture7/Stream/DS_definitions.h @@ -0,0 +1,197 @@ +#pragma once + +#include +#include +#include +#include + +#define OS_WINDOWS 0 +#define OS_LINUX 1 + +#ifdef _WIN32 +#define _TARGET_OS OS_WINDOWS +#else + #ifndef nullptr + #define nullptr NULL + #endif + #define _TARGET_OS OS_LINUX +#endif + +/************************************************************************/ +/* OS dependet function */ +/************************************************************************/ +#if _TARGET_OS == OS_WINDOWS +// #define _SPRINT sprintf_s +#define _STRTOK strtok_s + +#define EXIT_WIHT_KEYPRESS {std::cout << "Press any key to exit..."; getchar(); exit(0);} + +#define SPLIT_PATH(_path,_result) \ + _splitpath_s(_path, _result.drive, 255, _result.dir, 255, _result.filename, 255, _result.ext, 255) + + +#elif _TARGET_OS == OS_LINUX +#include +#include + +#define _STRTOK strtok_r + +#define EXIT_WIHT_KEYPRESS {std::cout << "Program was terminated!"; exit(0);} + +#define sprintf_s sprintf +#define scanf_s scanf +#define fprintf_s fprintf + +#define __int64 int64_t + +#define fopen_s(fp, name, mode) (*fp = fopen(name, mode)) + +#endif + +/************************************************************************/ +/* Defines */ +/************************************************************************/ + +// *********** data size +#define _1K_ 1024 +#define _1M_ (_1K_*_1K_) +#define _1G_ (_1M_*_1K_) + +#define CHAR_STRING_SIZE 255 + +/************************************************************************/ +/* Type definitions */ +/************************************************************************/ +typedef unsigned int UINT ; + +/************************************************************************/ +/* Macro functions */ +/************************************************************************/ +#define DS_MEM_DELETE(a) \ + if (a != NULL) { \ + delete a ; \ + a = NULL ; \ + } + +#define DS_MEM_DELETE_ARRAY(a) \ + if (a != NULL) { \ + delete [] a ; \ + a = NULL ; \ + } + +#define RANGE_MIN 0 +#define RANGE_MAX 1 + +#define MATCHED_STRING 0 + +#ifndef VTK_RANGE_MIN +#define VTK_RANGE_MIN 0 +#define VTK_RANGE_MAX 1 +#endif + +// Print +#define PRINT_LINE_INFO printf("%s, line %d", __FILE__, __LINE__) +#define PRINT_ERROR_MSG(_msg) {PRINT_LINE_INFO; printf(" at "); printf(_msg);} + +// Single loops +#define LOOP_I(a) for(int i=0; i +void SWAP(T &a, T &b){ + T tmp = a; + a = b; + b = tmp; +} +#endif + +// +#ifndef MIN +#define MIN(a,b) (a > b ? b : a) +#endif + +#ifndef MAX +#define MAX(a,b) (a > b ? a : b) +#endif + +// Index converter + +#define INDEX2X(_ID,_W) (_ID%_W) +#define INDEX2Y(_ID,_W) (_ID/_W) +#define INDEX2ID(_ID,_X,_Y,_W) {_X=INDEX2X(_ID,_W);_Y=INDEX2Y(_ID_,_W);} +#define ID2INDEX(_W,_X,_Y) (_Y*_W+_X) +#define PTR2ID(_type, _target, _base) ((_type*)_target - (_type*)_base) + +// Memory allocation and release +#ifndef SAFE_DELETE +#define SAFE_DELETE(p) {if(p!=NULL) delete p; p=NULL;} +#endif + +#ifndef SAFE_DELETE_ARR +#define SAFE_DELETE_ARR(p) {if(p!=NULL) delete [] p; p=NULL;} +#endif + +#define SAFE_NEW(p, type, size) {\ + try {p = new type[size];} \ + catch(std::bad_alloc& exc) \ + { printf("[%s, line %d] fail to memory allocation - %.2f MB requested\n", __FILE__, __LINE__, (float)(sizeof(type)*size)/_1M_); \ + EXIT_WIHT_KEYPRESS }\ + } + +template +void memsetZero(T** p, long long size = 0) { + if (*p != NULL) + memset(*p, 0, sizeof(T)*size); +} + +template +void allocNinitMem(T** p, long long size, double *memUsage = NULL) { + *p = new T[size]; + //SAFE_NEW(*p, T, size); + memset(*p, 0, sizeof(T)*size); + + if (memUsage != NULL) { + *memUsage += sizeof(T)*size; + } +} + +#define SAFE_MEMCPY(_dst, _src, _type, _size){ \ + if(_dst == nullptr || _src == nullptr ) \ + printf("[%s, line %d] fail to memcpy (dst = %x, src = %x)\n", __FILE__, __LINE__, _dst, _src); \ + exit(-1); \ + memcpy(_dst, _src, sizeof(_type)*_size);\ +} + +// VTK related +#ifndef SAFE_DELETE_VTK +#define SAFE_DELETE_VTK(p) {if(p!=NULL) p->Delete(); p=NULL;} +#endif + +#ifndef VTK_IS_NOERROR +//#include "DS_common_def.h" +#define VTK_IS_NOERROR(p) (p->GetErrorCode()==vtkErrorCode::NoError ? true : false) +#endif + +/************************************************************************/ +/* Data structures */ +/************************************************************************/ +typedef struct { + std::string input; + std::string output; +} nameMatch; + +typedef struct { + char drive[255]; + char dir[255]; + char filename[255]; + char ext[255]; +} filePathSplit; diff --git a/Intermediate_Class/Lecture7/Stream/DS_timer.cpp b/Intermediate_Class/Lecture7/Stream/DS_timer.cpp new file mode 100644 index 0000000..876a2c0 --- /dev/null +++ b/Intermediate_Class/Lecture7/Stream/DS_timer.cpp @@ -0,0 +1,336 @@ +#include "DS_timer.h" +#include "DS_definitions.h" +#include + +//#ifndef _WIN32 +//#define fprintf_s fprintf +//void fopen_s(FILE** fp, char* name, char* mode) +//{ +// *fp = fopen(name, mode); +//} +//#endif + +/************************************************************************/ +/* Constructor & Destructor */ +/************************************************************************/ +DS_timer::DS_timer(int _numTimer /* =0 */, int _numCount /* =0 */, bool _trunOn /* =true */){ + + turnOn = _trunOn ; + start_ticks = NULL ; end_ticks = NULL ; totalTicks = NULL ; + counters = NULL ; + timerStates = NULL ; + timerName = NULL ; + + numTimer = numCounter = 0 ; + + setTimerTitle((char*)"DS_timer Report") ; + + setTimer(_numTimer ) ; + setCounter( _numTimer ) ; + +#ifdef _WIN32 + // For windows + QueryPerformanceFrequency(&ticksPerSecond) ; +#endif +} + +DS_timer::~DS_timer(void){ + releaseTimers() ; + releaseCounters() ; +} + +/************************************************************************/ +/* Set & Get configurations */ +/************************************************************************/ + +// Memory allocation +void DS_timer::memAllocCounters( void ) { + releaseCounters() ; + counters = new UINT[numCounter] ; + initCounters() ; +} + +void DS_timer::memAllocTimers( void ) { + releaseTimers() ; + + timerStates = new bool[numTimer] ; + start_ticks = new TIME_VAL[numTimer]; + end_ticks = new TIME_VAL[numTimer]; + totalTicks = new TIME_VAL[numTimer]; + + // Initialize + memset(timerStates, 0, sizeof(bool)*numTimer); + memset(start_ticks, 0, sizeof(TIME_VAL)*numTimer); + memset(end_ticks, 0, sizeof(TIME_VAL)*numTimer); + memset(totalTicks, 0, sizeof(TIME_VAL)*numTimer); + + timerName = new std::string[numTimer] ; + for ( UINT i = 0 ; i < numTimer ; i++ ) { + timerName[i].clear() ; + timerName[i].resize(255) ; + } +} + +// Memory release +void DS_timer::releaseCounters( void ) { DS_MEM_DELETE_ARRAY(counters) ; } +void DS_timer::releaseTimers( void ) { + //DS_MEM_DELETE(start_ticks) ; DS_MEM_DELETE(end_ticks) ; DS_MEM_DELETE(totalTicks) ; + DS_MEM_DELETE_ARRAY(timerStates) ; + DS_MEM_DELETE_ARRAY(start_ticks) ; + DS_MEM_DELETE_ARRAY(end_ticks) ; + DS_MEM_DELETE_ARRAY(totalTicks) ; + DS_MEM_DELETE_ARRAY(timerName) ; +} + +// Getters +UINT DS_timer::getNumTimer( void ) { return numTimer ; } +UINT DS_timer::getNumCounter( void ) { return numCounter ; } + +// Setters +UINT DS_timer::setTimer( UINT _numTimer ) { + if ( _numTimer == 0 ) + return 0 ; + + if (_numTimer <= numTimer) + return numTimer; + + if ( numTimer != 0 ) { + + // Backup + UINT oldNumTimer = numTimer ; + TIME_VAL *oldTotalTicks = new TIME_VAL[oldNumTimer]; + memcpy(oldTotalTicks, totalTicks, sizeof(TIME_VAL)*oldNumTimer); + + numTimer = _numTimer ; + memAllocTimers() ; + + memcpy(totalTicks, oldTotalTicks, sizeof(TIME_VAL)* oldNumTimer); + delete oldTotalTicks ; + } else { + numTimer = _numTimer ; + memAllocTimers() ; + } + + return _numTimer ; +} + +UINT DS_timer::setCounter( UINT _numCounter ) { + + if (_numCounter == 0 ) + return 0 ; + + if (_numCounter <= numCounter) + return numCounter; + + if ( numCounter != 0 ) { + + // Backup + int numOldCounter = numCounter ; + UINT *oldCounters = new UINT[numOldCounter] ; + memcpy(oldCounters, counters, sizeof(UINT)*numOldCounter) ; + + numCounter = _numCounter ; + memAllocCounters() ; + + // Restore + memcpy(counters, oldCounters, sizeof(UINT)*numOldCounter) ; + delete oldCounters ; + + } else { + numCounter = _numCounter ; + memAllocCounters() ; + } + + return numCounter ; + +} + +/************************************************************************/ +/* Timer */ +/************************************************************************/ +void DS_timer::initTimer( UINT id ) { + timerStates[id] = TIMER_OFF ; +#ifdef _WIN32 + totalTicks[id].QuadPart = 0 ; +#else + totalTicks[id].tv_sec = 0; + totalTicks[id].tv_usec = 0; +#endif +} + +void DS_timer::initTimers( void ) { + for ( UINT i = 0 ; i < numTimer ; i++ ) { + initTimer(i); + } +} + +void DS_timer::onTimer( UINT id ) { + if ( turnOn == false ) + return ; + + if ( timerStates[id] == TIMER_ON ) + return ; +#ifdef _WIN32 + QueryPerformanceCounter(&start_ticks[id]) ; +#else + gettimeofday(&start_ticks[id], NULL); +#endif + + timerStates[id] = TIMER_ON ; +} + +void DS_timer::offTimer( UINT id ) { + if ( turnOn == false ) + return ; + + if ( timerStates[id] == TIMER_OFF ) + return ; + +#ifdef _WIN32 + QueryPerformanceCounter(&end_ticks[id]) ; + totalTicks[id].QuadPart = totalTicks[id].QuadPart + (end_ticks[id].QuadPart - start_ticks[id].QuadPart) ; +#else + gettimeofday(&end_ticks[id], NULL); + TIME_VAL period, previous; + timersub(&end_ticks[id], &start_ticks[id], &period); + previous = totalTicks[id]; + timeradd(&previous, &period, &totalTicks[id]); +#endif + + timerStates[id] = TIMER_OFF ; +} + +double DS_timer::getTimer_ms( UINT id ) { +#ifdef _WIN32 + return ((double)totalTicks[id].QuadPart/(double)ticksPerSecond.QuadPart * 1000) ; +#else + return (double)(totalTicks[id].tv_sec * 1000 + totalTicks[id].tv_usec / 1000.0); +#endif +} + +/************************************************************************/ +/* Counter */ +/************************************************************************/ +void DS_timer::incCounter( UINT id ) { + if ( turnOn == false ) + return ; + + counters[id]++ ; +} + +void DS_timer::initCounters( void ) { + if ( turnOn == false ) + return ; + + for ( UINT i = 0 ; i < numCounter ; i++ ) + counters[i] = 0 ; +} + +void DS_timer::initCounter( UINT id ) { + if ( turnOn == false ) + return ; + + counters[id] = 0 ; +} + +void DS_timer::add2Counter( UINT id, UINT num ) { + if ( turnOn == false ) + return ; + + counters[id] += num ; +} + +UINT DS_timer::getCounter( UINT id ) { + if ( turnOn == false ) + return 0; + + return counters[id] ; +} + +/************************************************************************/ +/* */ +/************************************************************************/ +void DS_timer::printTimer( float _denominator){ + + if ( turnOn == false ) + return ; + + //printf("\n*\t DS_timer Report \t*\n") ; + printf("\n*\t %s \t*\n", timerTitle) ; + printf("* The number of timer = %d, counter = %d\n", numTimer, numCounter ) ; + printf("**** Timer report ****\n") ; + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( getTimer_ms(i) == 0 ) + continue ; + if ( timerName[i].c_str()[0] == 0 ) + printf("Timer %d : %.5f ms\n", i, getTimer_ms(i) ) ; + else + printf("%s : %.5f ms\n", timerName[i].c_str(), getTimer_ms(i) ) ; + } + + printf("**** Counter report ****\n") ; + for ( UINT i = 0 ; i < numCounter ;i++ ) { + if ( counters[i] == 0 ) + continue ; + printf("Counter %d : %.3f (%d) \n",i, counters[i]/_denominator, counters[i] ) ; + } + + printf("*\t End of the report \t*\n") ; +} + +void DS_timer::printToFile( char* fileName, int _id ) +{ + if ( turnOn == false ) + return ; + + FILE *fp ; + + if ( fileName == NULL) + fopen_s(&fp, "DS_timer_report.txt", "a") ; + else { + fopen_s(&fp, fileName, "a") ; + } + + if ( _id >= 0 ) + fprintf_s(fp,"%d\t", _id) ; + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( getTimer_ms(i) == 0 ) + continue ; + fprintf_s(fp, "%s: %.9f\n", timerName[i].c_str(), getTimer_ms(i) ) ; + } + + for ( UINT i = 0 ; i < numCounter ;i++ ) { + if ( counters[i] == 0 ) + continue ; + fprintf_s(fp, "%s: %d\n", timerName[i].c_str(), counters[i] ) ; + } + + fprintf_s(fp, "\n") ; + + fclose(fp) ; +} + +void DS_timer::printTimerNameToFile( char* fileName ) +{ + if ( turnOn == false ) + return ; + + FILE *fp ; + + if ( fileName == NULL) + fopen_s(&fp, "DS_timer_name.txt", "a") ; + else { + fopen_s(&fp, fileName, "a") ; + } + + + for ( UINT i = 0 ; i < numTimer ; i++ ) { + if ( timerName[i].empty() ) + continue ; + fprintf_s(fp, "%s\t", timerName[i].c_str() ) ; + } + fprintf_s(fp, "\n") ; + fclose(fp) ; +} diff --git a/Intermediate_Class/Lecture7/Stream/DS_timer.h b/Intermediate_Class/Lecture7/Stream/DS_timer.h new file mode 100644 index 0000000..d1c3391 --- /dev/null +++ b/Intermediate_Class/Lecture7/Stream/DS_timer.h @@ -0,0 +1,95 @@ +//#pragma once +#ifndef _DS_TIMER_H +#define _DS_TIMER_H + +#include // std string + +#ifndef UINT +typedef unsigned int UINT; +#endif + +#ifdef _WIN32 + // For windows + #include + typedef LARGE_INTEGER TIME_VAL; +#else + // For Unix/Linux + #include + #include + #include + #include // c string + typedef struct timeval TIME_VAL; +#endif + +#define TIMER_ON true +#define TIMER_OFF false + +class DS_timer +{ +private : + + bool turnOn ; + + UINT numTimer ; + UINT numCounter ; + + // For timers + bool* timerStates ; + TIME_VAL ticksPerSecond; + TIME_VAL *start_ticks; + TIME_VAL *end_ticks; + TIME_VAL *totalTicks; + + char timerTitle[255] ; + std::string *timerName ; + + // For counters + UINT *counters ; + + void memAllocCounters ( void ) ; + void memAllocTimers ( void ) ; + void releaseCounters ( void ) ; + void releaseTimers ( void ) ; + +public: + DS_timer(int _numTimer = 1, int _numCount = 1, bool _trunOn = true ); + ~DS_timer(void); + + // For configurations + inline void timerOn ( void ) { turnOn = TIMER_ON ; } + inline void timerOff ( void ) { turnOn = TIMER_OFF ; } + + UINT getNumTimer( void ) ; + UINT getNumCounter ( void ) ; + UINT setTimer ( UINT _numTimer ) ; + UINT setCounter ( UINT _numCounter ) ; + + // For timers + + void initTimer(UINT id) ; + void initTimers ( void ); + void onTimer(UINT id) ; + void offTimer(UINT id ) ; + double getTimer_ms(UINT id) ; + + void setTimerTitle ( char* _name ) { memset(timerTitle, 0, sizeof(char)*255) ; memcpy(timerTitle, _name, strlen(_name)) ; } + + void setTimerName (UINT id, std::string &_name) { timerName[id] = _name ; } + void setTimerName (UINT id, char* _name) { timerName[id] = _name ;} + + // For counters + + void incCounter(UINT id) ; + void initCounters( void ) ; + void initCounter(UINT id) ; + void add2Counter( UINT id, UINT num ) ; + UINT getCounter ( UINT id ) ; + + // For reports + + void printTimer ( float _denominator = 1 ) ; + void printToFile ( char* fileName, int _id = -1 ) ; + void printTimerNameToFile ( char* fileName ) ; +} ; + +#endif \ No newline at end of file diff --git a/Intermediate_Class/Lecture7/Stream/Stream.cu b/Intermediate_Class/Lecture7/Stream/Stream.cu new file mode 100644 index 0000000..3744662 --- /dev/null +++ b/Intermediate_Class/Lecture7/Stream/Stream.cu @@ -0,0 +1,110 @@ +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include "DS_timer.h" +#include +#include +#include + +#define NUM_BLOCK (128*1024) +#define NUM_T_IN_B 1024 +#define ARRAY_SIZE (NUM_T_IN_B*NUM_BLOCK) + +#define NUM_STREAMS 2 + +__global__ void myKernel(int* _in, int* _out) +{ + int tID = blockDim.x * blockIdx.x + threadIdx.x; + + int temp = 0; + for (int i = 0; i < 250; i++) { + temp = (temp + _in[tID] * 5) % 10; + } + _out[tID] = temp; + +} + +void main(void) +{ + DS_timer timer(10); + timer.setTimerName(0, "Single stream"); + timer.setTimerName(1, " * Host -> Device"); + timer.setTimerName(2, " * Kernel execution"); + timer.setTimerName(3, " * Device -> Host"); + timer.setTimerName(4, "Multiple streams"); + + int* in = NULL, * out = NULL, * out2 = NULL; + + cudaMallocHost(&in, sizeof(int) * ARRAY_SIZE); + memset(in, 0, sizeof(int) * ARRAY_SIZE); + + cudaMallocHost(&out, sizeof(int) * ARRAY_SIZE); + memset(out, 0, sizeof(int) * ARRAY_SIZE); + + cudaMallocHost(&out2, sizeof(int) * ARRAY_SIZE); + memset(out2, 0, sizeof(int) * ARRAY_SIZE); + + int* dIn, * dOut; + cudaMalloc(&dIn, sizeof(int) * ARRAY_SIZE); + cudaMalloc(&dOut, sizeof(int) * ARRAY_SIZE); + + for (int i = 0; i < ARRAY_SIZE; i++) + in[i] = rand() % 10; + + // Single stram version + timer.onTimer(0); + + timer.onTimer(1); + cudaMemcpy(dIn, in, sizeof(int) * ARRAY_SIZE, cudaMemcpyHostToDevice); + timer.offTimer(1); + + timer.onTimer(2); + myKernel << > > (dIn, dOut); + cudaDeviceSynchronize(); + timer.offTimer(2); + + timer.onTimer(3); + cudaMemcpy(out, dOut, sizeof(int) * ARRAY_SIZE, cudaMemcpyDeviceToHost); + timer.offTimer(3); + + timer.offTimer(0); + + // Multiple stream version + cudaStream_t stream[NUM_STREAMS]; + + for(int i = 0 ; i < NUM_STREAMS ; i++) + cudaStreamCreate(&stream[i]); + + timer.onTimer(4); + + int chunkSize = ARRAY_SIZE / NUM_STREAMS; + + for (int i = 0; i < NUM_STREAMS ; i++) + { + int offset = chunkSize * i; + cudaMemcpyAsync(dIn + offset, in + offset, sizeof(int) * chunkSize, cudaMemcpyHostToDevice, stream[i]); + myKernel << > > (dIn + offset, dOut + offset); + cudaMemcpyAsync(out2 + offset, dOut + offset, sizeof(int) * chunkSize, cudaMemcpyDeviceToHost, stream[i]); + } + + cudaDeviceSynchronize(); + timer.offTimer(4); + + for (int i = 0; i < ARRAY_SIZE; i++) + { + if (out[i] != out2[i]) + printf("!"); + } + + for (int i = 0; i < NUM_STREAMS; i++) + cudaStreamDestroy(stream[i]); + + timer.printTimer(); + + cudaFree(dIn); + cudaFree(dOut); + + cudaFreeHost(in); + cudaFreeHost(out); + cudaFreeHost(out2); +} \ No newline at end of file diff --git a/VS2019/Intermediate_Class/Lecture7/Stream/Stream.vcxproj b/VS2019/Intermediate_Class/Lecture7/Stream/Stream.vcxproj new file mode 100644 index 0000000..aa50fc6 --- /dev/null +++ b/VS2019/Intermediate_Class/Lecture7/Stream/Stream.vcxproj @@ -0,0 +1,158 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {afd873f0-6742-40cc-a371-ffb188bc1ac1} + Stream + 10.0 + + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + true + + + false + + + true + + + false + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Document + + + + + + + + + + + + + + \ No newline at end of file diff --git a/VS2019/Intermediate_Class/Lecture7/Stream/Stream.vcxproj.filters b/VS2019/Intermediate_Class/Lecture7/Stream/Stream.vcxproj.filters new file mode 100644 index 0000000..9824bec --- /dev/null +++ b/VS2019/Intermediate_Class/Lecture7/Stream/Stream.vcxproj.filters @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file