Skip to content

Commit

Permalink
update Lec8
Browse files Browse the repository at this point in the history
  • Loading branch information
bluekds committed Oct 14, 2021
1 parent e9fcf70 commit f2f83b6
Show file tree
Hide file tree
Showing 15 changed files with 1,299 additions and 0 deletions.
33 changes: 33 additions & 0 deletions CUDA_Programming.sln
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Stream", "VS2019\Intermedia
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Lecture8", "Lecture8", "{40E3E314-3859-4C01-BCD7-331540F33D08}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DeviceQuery", "VS2019\Intermediate_Class\Lecture8\DeviceQuery\DeviceQuery.vcxproj", "{6A292A04-DE61-425A-832A-81BAD9E73C6C}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CUDA_Event", "VS2019\Intermediate_Class\Lecture8\CUDA_Event\CUDA_Event.vcxproj", "{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HetergeneousComputing", "VS2019\Intermediate_Class\Lecture8\HetergeneousComputing\HetergeneousComputing.vcxproj", "{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Expand Down Expand Up @@ -175,6 +181,30 @@ Global
{AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x64.Build.0 = Release|x64
{AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x86.ActiveCfg = Release|Win32
{AFD873F0-6742-40CC-A371-FFB188BC1AC1}.Release|x86.Build.0 = Release|Win32
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Debug|x64.ActiveCfg = Debug|x64
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Debug|x64.Build.0 = Debug|x64
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Debug|x86.ActiveCfg = Debug|Win32
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Debug|x86.Build.0 = Debug|Win32
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Release|x64.ActiveCfg = Release|x64
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Release|x64.Build.0 = Release|x64
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Release|x86.ActiveCfg = Release|Win32
{6A292A04-DE61-425A-832A-81BAD9E73C6C}.Release|x86.Build.0 = Release|Win32
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Debug|x64.ActiveCfg = Debug|x64
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Debug|x64.Build.0 = Debug|x64
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Debug|x86.ActiveCfg = Debug|Win32
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Debug|x86.Build.0 = Debug|Win32
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Release|x64.ActiveCfg = Release|x64
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Release|x64.Build.0 = Release|x64
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Release|x86.ActiveCfg = Release|Win32
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C}.Release|x86.Build.0 = Release|Win32
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Debug|x64.ActiveCfg = Debug|x64
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Debug|x64.Build.0 = Debug|x64
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Debug|x86.ActiveCfg = Debug|Win32
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Debug|x86.Build.0 = Debug|Win32
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Release|x64.ActiveCfg = Release|x64
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Release|x64.Build.0 = Release|x64
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Release|x86.ActiveCfg = Release|Win32
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -207,6 +237,9 @@ Global
{8C2E78EF-0AF9-4AD6-B1E3-2A3BB7DB2798} = {B3430C43-00B8-443C-A7C2-DCF639AEADD1}
{AFD873F0-6742-40CC-A371-FFB188BC1AC1} = {8C2E78EF-0AF9-4AD6-B1E3-2A3BB7DB2798}
{40E3E314-3859-4C01-BCD7-331540F33D08} = {B3430C43-00B8-443C-A7C2-DCF639AEADD1}
{6A292A04-DE61-425A-832A-81BAD9E73C6C} = {40E3E314-3859-4C01-BCD7-331540F33D08}
{2E2C2276-BEE7-4C17-9A9F-7B7D1426CE9C} = {40E3E314-3859-4C01-BCD7-331540F33D08}
{0DF2DE5C-951E-458B-B8C3-137B7FF0D43F} = {40E3E314-3859-4C01-BCD7-331540F33D08}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {52DE253B-6715-4A93-BA27-7131973681F9}
Expand Down
100 changes: 100 additions & 0 deletions Intermediate_Class/Lecture8/CUDA_Event/CUDA_Event.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "DS_timer.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ARRAY_SIZE (64*1024*1024)
#define B_SIZE (1024)

#define NUM_STREAMS 1

/*
 * Per-element test kernel.
 *
 * Each thread reads _in[gid], folds that value 250 times through
 *     acc = (acc + value * 5) % 10
 * and stores the final accumulator in _out[gid].
 *
 * The global index is derived from a 1D launch (blockIdx.x, blockDim.x,
 * threadIdx.x) and is NOT range-checked, so the launch must not create more
 * threads than there are elements in _in / _out.
 */
__global__ void myKernel(int* _in, int* _out)
{
	const int gid = threadIdx.x + blockIdx.x * blockDim.x;

	int acc = 0;
	int round = 0;
	while (round < 250) {
		acc = (acc + _in[gid] * 5) % 10;
		++round;
	}

	_out[gid] = acc;
}

/*
 * Exits with a diagnostic when a CUDA runtime call did not return cudaSuccess.
 *   status - value returned by the CUDA call
 *   expr   - source text of the call (for the message)
 *   line   - source line of the call
 */
static void checkCuda(cudaError_t status, const char* expr, int line)
{
	if (status == cudaSuccess)
		return;

	fprintf(stderr, "CUDA error at line %d: %s failed: %s\n", line, expr, cudaGetErrorString(status));
	exit(EXIT_FAILURE);
}

// Wraps a CUDA runtime call so that a failure is reported with its source text and line.
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

/*
 * Splits an ARRAY_SIZE-element array into NUM_STREAMS chunks, runs
 * H2D copy -> myKernel -> D2H copy for each chunk on its own stream, and
 * reports both the host-side wall time (DS_timer) and the per-stream time
 * measured with CUDA events.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
 */
int main(void)
{
	// myKernel has no bounds guard and is launched with chunkSize / B_SIZE blocks,
	// so every chunk must be a whole number of blocks or tail elements are skipped.
	if (ARRAY_SIZE % (NUM_STREAMS * B_SIZE) != 0) {
		fprintf(stderr, "ARRAY_SIZE must be a multiple of NUM_STREAMS * B_SIZE\n");
		return EXIT_FAILURE;
	}

	DS_timer timer(1);
	timer.setTimerName(0, "Total");

	int* in = NULL, * out = NULL, * out2 = NULL;

	// Host buffers come from cudaMallocHost because they are used with cudaMemcpyAsync below.
	CUDA_CHECK(cudaMallocHost(&in, sizeof(int) * ARRAY_SIZE));
	memset(in, 0, sizeof(int) * ARRAY_SIZE);

	// `out` is zero-filled and never written again: it is the reference result.
	// For inputs in [0, 9] the term in * 5 is 0 or 5 modulo 10, so the kernel's
	// accumulator is back at 0 after every even number of steps (250 here).
	CUDA_CHECK(cudaMallocHost(&out, sizeof(int) * ARRAY_SIZE));
	memset(out, 0, sizeof(int) * ARRAY_SIZE);

	CUDA_CHECK(cudaMallocHost(&out2, sizeof(int) * ARRAY_SIZE));
	memset(out2, 0, sizeof(int) * ARRAY_SIZE);

	int* dIn = NULL, * dOut = NULL;
	CUDA_CHECK(cudaMalloc(&dIn, sizeof(int) * ARRAY_SIZE));
	CUDA_CHECK(cudaMalloc(&dOut, sizeof(int) * ARRAY_SIZE));

	for (int i = 0; i < ARRAY_SIZE; i++)
		in[i] = rand() % 10;

	// Multiple stream version
	cudaStream_t stream[NUM_STREAMS];
	cudaEvent_t start[NUM_STREAMS], end[NUM_STREAMS];

	for (int i = 0; i < NUM_STREAMS; i++) {
		CUDA_CHECK(cudaStreamCreate(&stream[i]));
		CUDA_CHECK(cudaEventCreate(&start[i]));
		CUDA_CHECK(cudaEventCreate(&end[i]));
	}

	int chunkSize = ARRAY_SIZE / NUM_STREAMS;

	timer.onTimer(0);
	for (int i = 0; i < NUM_STREAMS; i++)
	{
		int offset = chunkSize * i;

		// start/end are recorded into the same stream as the work they bracket.
		CUDA_CHECK(cudaEventRecord(start[i], stream[i]));

		CUDA_CHECK(cudaMemcpyAsync(dIn + offset, in + offset, sizeof(int) * chunkSize, cudaMemcpyHostToDevice, stream[i]));
		myKernel <<<chunkSize / B_SIZE, B_SIZE, 0, stream[i] >>> (dIn + offset, dOut + offset);
		// A kernel launch returns nothing; launch-configuration errors are read back here.
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaMemcpyAsync(out2 + offset, dOut + offset, sizeof(int) * chunkSize, cudaMemcpyDeviceToHost, stream[i]));

		CUDA_CHECK(cudaEventRecord(end[i], stream[i]));
	}

	// Wait for all streams; errors raised while the queued work executed surface here.
	CUDA_CHECK(cudaDeviceSynchronize());
	timer.offTimer(0);
	timer.printTimer();

	for (int i = 0; i < NUM_STREAMS; i++) {
		float elapsedMs = 0;
		CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start[i], end[i]));
		printf("Stream[%d] : %f ms\n", i, elapsedMs);
	}

	// Prints one '!' per element that differs from the reference buffer.
	for (int i = 0; i < ARRAY_SIZE; i++) {
		if (out[i] != out2[i])
			printf("!");
	}

	for (int i = 0; i < NUM_STREAMS; i++) {
		CUDA_CHECK(cudaStreamDestroy(stream[i]));
		CUDA_CHECK(cudaEventDestroy(start[i]));
		CUDA_CHECK(cudaEventDestroy(end[i]));
	}

	CUDA_CHECK(cudaFree(dIn));
	CUDA_CHECK(cudaFree(dOut));

	CUDA_CHECK(cudaFreeHost(in));
	CUDA_CHECK(cudaFreeHost(out));
	CUDA_CHECK(cudaFreeHost(out2));

	return 0;
}
197 changes: 197 additions & 0 deletions Intermediate_Class/Lecture8/CUDA_Event/DS_definitions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>

// Identifiers for the supported target operating systems.
#define OS_WINDOWS 0
#define OS_LINUX 1

// _TARGET_OS is chosen from the compiler's predefined macros:
// _WIN32 selects Windows, anything else is treated as Linux.
#ifdef _WIN32
#define _TARGET_OS OS_WINDOWS
#else
// nullptr is a keyword from C++11 on, never a macro, so `#ifndef nullptr`
// alone is always true. Only provide the NULL fallback for pre-C++11
// compilers; redefining the keyword breaks overload resolution on nullptr.
#if __cplusplus < 201103L
#ifndef nullptr
#define nullptr NULL
#endif
#endif
#define _TARGET_OS OS_LINUX
#endif

/************************************************************************/
/* OS dependent functions */
/************************************************************************/
#if _TARGET_OS == OS_WINDOWS
// #define _SPRINT sprintf_s
// Reentrant tokenizer: the MSVC spelling.
#define _STRTOK strtok_s

// Prompts, waits for one character on stdin, then exits with status 0.
#define EXIT_WIHT_KEYPRESS {std::cout << "Press any key to exit..."; getchar(); exit(0);}

// Splits _path into the drive/dir/filename/ext members of _result.
// Each member is passed with a capacity of 255 (see filePathSplit).
#define SPLIT_PATH(_path,_result) \
_splitpath_s(_path, _result.drive, 255, _result.dir, 255, _result.filename, 255, _result.ext, 255)


#elif _TARGET_OS == OS_LINUX
#include <libgen.h>
#include <inttypes.h>

// Reentrant tokenizer: the POSIX spelling.
#define _STRTOK strtok_r

// No key press is awaited on this platform; prints a message and exits with status 0.
#define EXIT_WIHT_KEYPRESS {std::cout << "Program was terminated!"; exit(0);}

// Map the MSVC "_s" names onto the plain C functions.
// NOTE(review): these are pure renames, so a call that passes the extra
// buffer-size argument of sprintf_s will not translate correctly.
#define sprintf_s sprintf
#define scanf_s scanf
#define fprintf_s fprintf

#define __int64 int64_t

// Emulates the MSVC fopen_s contract: stores the stream in *fp and evaluates
// to 0 on success, non-zero on failure. The previous shim evaluated to the
// FILE* itself, i.e. non-zero on SUCCESS, which inverted `fopen_s(...) != 0`
// error checks written against the Windows API.
#define fopen_s(fp, name, mode) (((*(fp) = fopen((name), (mode))) == NULL) ? 1 : 0)

#endif

/************************************************************************/
/* Defines */
/************************************************************************/

// *********** data size
// Size multipliers built from powers of 1024: _1K_ = 1024, _1M_ = 1024^2, _1G_ = 1024^3.
#define _1K_ 1024
#define _1M_ (_1K_*_1K_)
#define _1G_ (_1M_*_1K_)

// NOTE(review): presumably the capacity for fixed-size char buffers; no use is visible in this header.
#define CHAR_STRING_SIZE 255

/************************************************************************/
/* Type definitions */
/************************************************************************/
// Short alias for unsigned int.
typedef unsigned int UINT ;

/************************************************************************/
/* Macro functions */
/************************************************************************/
// Deletes the single object `a` points to (scalar delete) and resets `a` to NULL.
// `a` must be a modifiable pointer lvalue because it is assigned to.
// Expands to a bare `if` statement, so do not use it as the unbraced body of
// an if/else: a following `else` would bind to the macro's own `if`.
#define DS_MEM_DELETE(a) \
if (a != NULL) { \
delete a ; \
a = NULL ; \
}

// Array counterpart of DS_MEM_DELETE: releases with delete[] and resets `a` to NULL.
// The same lvalue and if/else caveats apply.
#define DS_MEM_DELETE_ARRAY(a) \
if (a != NULL) { \
delete [] a ; \
a = NULL ; \
}

// NOTE(review): presumably indices into a two-element {min, max} range array; confirm against callers.
#define RANGE_MIN 0
#define RANGE_MAX 1

// NOTE(review): presumably the "strings are equal" result of strcmp-style comparisons; confirm against callers.
#define MATCHED_STRING 0

// Same two values under VTK-prefixed names, defined only when not already provided elsewhere.
#ifndef VTK_RANGE_MIN
#define VTK_RANGE_MIN 0
#define VTK_RANGE_MAX 1
#endif

// Print
// Prints "<file>, line <n>" for the expansion site, without a trailing newline.
#define PRINT_LINE_INFO printf("%s, line %d", __FILE__, __LINE__)
// Prints the line info, then " at ", then _msg.
// _msg is handed to printf as the FORMAT string, so a literal '%' in it must be written as "%%".
#define PRINT_ERROR_MSG(_msg) {PRINT_LINE_INFO; printf(" at "); printf(_msg);}

// Single loops
// Loop-header macros: each expands to a `for (...)` header and must be
// followed by the loop body. Bounds are parenthesized so that expression
// arguments such as `cond ? 3 : 5` or `n + 1` are compared as a whole.
// LOOP_I / LOOP_J / LOOP_K declare the counters i / j / k and run over [0, a).
#define LOOP_I(a) for(int i=0; i<(a); i++)
#define LOOP_J(a) for(int j=0; j<(a); j++)
#define LOOP_K(a) for(int k=0; k<(a); k++)
// Same, with a caller-chosen counter name: [0, end) and [start, end).
#define LOOP_INDEX(index, end) for (int index = 0 ; index < (end) ; index++)
#define LOOP_INDEX_START_END(index, start, end) for (int index = (start) ; index < (end) ; index++)

// Multiple loops
// Nested headers; the first argument is the outermost bound and i is the innermost counter.
#define LOOP_J_I(b, a) LOOP_J(b) LOOP_I(a)
#define LOOP_K_J_I(c,b,a) LOOP_K(c) LOOP_J_I(b,a)

//
#ifndef SWAP
// Exchanges the values of a and b through one temporary copy of a.
// T needs to be copy-constructible and copy-assignable.
template <class T>
void SWAP(T& a, T& b)
{
	T saved(a);
	a = b;
	b = saved;
}
#endif

//
// Smaller / larger of two values. Arguments are fully parenthesized so that
// operators of lower precedence than `>` inside an argument (e.g. `x & y`)
// are evaluated first. Each argument can be evaluated twice, so avoid
// side effects such as MIN(i++, j).
#ifndef MIN
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

// Index converter

// Conversions between a flat index and (x, y) coordinates for a row width _W.
// All arguments are parenthesized so expression arguments keep their value.
// Column / row of the flat index _ID.
#define INDEX2X(_ID,_W) ((_ID)%(_W))
#define INDEX2Y(_ID,_W) ((_ID)/(_W))
// Splits the flat index _ID into _X and _Y (both are assigned to).
// The previous version referenced an undeclared `_ID_` and could not compile when used.
#define INDEX2ID(_ID,_X,_Y,_W) {(_X)=INDEX2X(_ID,_W);(_Y)=INDEX2Y(_ID,_W);}
// Flat index of (_X, _Y).
#define ID2INDEX(_W,_X,_Y) ((_Y)*(_W)+(_X))
// Element offset of _target from _base, both viewed as pointers to _type.
#define PTR2ID(_type, _target, _base) ((_type*)(_target) - (_type*)(_base))

// Memory allocation and release
// Deletes the object p points to (scalar delete) and then sets p to NULL.
// p must be a modifiable pointer lvalue.
#ifndef SAFE_DELETE
#define SAFE_DELETE(p) {if(p!=NULL) delete p; p=NULL;}
#endif

// Array counterpart of SAFE_DELETE (delete[]).
#ifndef SAFE_DELETE_ARR
#define SAFE_DELETE_ARR(p) {if(p!=NULL) delete [] p; p=NULL;}
#endif

// Allocates `size` elements of `type` into p. On std::bad_alloc it prints the
// requested amount in MB with the file/line of the expansion site and then
// runs EXIT_WIHT_KEYPRESS.
// `size` is parenthesized in the byte computation so an expression argument
// such as `n + 1` reports the real request; the exception object is unused
// and therefore left unnamed.
#define SAFE_NEW(p, type, size) {\
try {p = new type[size];} \
catch(const std::bad_alloc&) \
{ printf("[%s, line %d] fail to memory allocation - %.2f MB requested\n", __FILE__, __LINE__, (float)(sizeof(type)*(size))/_1M_); \
EXIT_WIHT_KEYPRESS }\
}

// Zero-fills the first `size` elements of the array that *p points to.
// Does nothing when *p is NULL; with the default size of 0 no bytes are written.
template <class T>
void memsetZero(T** p, long long size = 0)
{
	T* target = *p;
	if (target == NULL)
		return;

	memset(target, 0, size * sizeof(T));
}

// Allocates `size` elements of T with new[], zero-fills them with memset and
// stores the array in *p. The caller releases it with delete[].
// When memUsage is given, the allocated byte count is added to *memUsage.
template <class T>
void allocNinitMem(T** p, long long size, double *memUsage = NULL)
{
	T* block = new T[size];
	memset(block, 0, size * sizeof(T));
	*p = block;

	if (memUsage == NULL)
		return;

	*memUsage += size * sizeof(T);
}

// Copies _size elements of _type from _src to _dst.
// When either pointer is null it prints both pointers with the file/line of
// the expansion site and terminates the program with exit(-1).
// The error branch is braced: previously exit(-1) sat outside the `if` and
// ran on every call, so the memcpy was unreachable. Pointers are printed
// with %p (cast to const void*) instead of %x.
#define SAFE_MEMCPY(_dst, _src, _type, _size){ \
	if((_dst) == nullptr || (_src) == nullptr ) { \
		printf("[%s, line %d] fail to memcpy (dst = %p, src = %p)\n", __FILE__, __LINE__, (const void*)(_dst), (const void*)(_src)); \
		exit(-1); \
	} \
	memcpy((_dst), (_src), sizeof(_type)*(_size));\
}

// VTK related
// Calls p->Delete() when p is non-null, then sets p to NULL.
// p must be a modifiable pointer lvalue to a type that provides Delete().
#ifndef SAFE_DELETE_VTK
#define SAFE_DELETE_VTK(p) {if(p!=NULL) p->Delete(); p=NULL;}
#endif

// Evaluates to true when p->GetErrorCode() equals vtkErrorCode::NoError.
// vtkErrorCode is not declared by this header; the including file has to provide it.
#ifndef VTK_IS_NOERROR
//#include "DS_common_def.h"
#define VTK_IS_NOERROR(p) (p->GetErrorCode()==vtkErrorCode::NoError ? true : false)
#endif

/************************************************************************/
/* Data structures */
/************************************************************************/
// A pair of strings: an input name and an output name.
// NOTE(review): presumably maps an input file to its output file; confirm against callers.
struct nameMatch {
	std::string input;
	std::string output;
};

// Components of a file path, each held in a 255-byte char buffer.
// Used as the _result argument of SPLIT_PATH on Windows.
struct filePathSplit {
	char drive[255];
	char dir[255];
	char filename[255];
	char ext[255];
};
Loading

0 comments on commit f2f83b6

Please sign in to comment.