diff --git a/src/client/client.cpp b/src/client/client.cpp index df004059..9be77d07 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -73,7 +73,6 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride std::vector< cl_device_id > device_id; cl_context context; cl_command_queue queue; - cl_event outEvent = NULL; clfftPlanHandle plan_handle; for (unsigned u = 0; u < max_dimensions; ++u) { @@ -204,7 +203,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } @@ -252,10 +251,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -289,7 +288,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -325,10 +324,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -373,7 +372,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -391,22 +390,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl; } - // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) ); - // Create and initialize our timer class, if the external timer shared library loaded - baseStatTimer* timer = NULL; - size_t clFFTID = 0; - if( get_timer ) - { - timer = get_timer( CLFFT_GPU ); - timer->Reserve( 1, profile_count ); - timer->setNormalize( true ); - - clFFTID = timer->getUniqueID( "clFFT", 0 ); - } OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" ); OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" ); @@ -511,37 +498,64 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride } } - // Loop as many times as the user specifies to average out the timings - // + cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ]; - Timer tr; - tr.Start(); + // Execute once for basic functional test + OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, NULL, + &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), + "clfftEnqueueTransform failed" ); - for( cl_uint i = 0; i < profile_count; ++i ) - { - if( timer ) timer->Start( clFFTID ); + OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); + - OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent, - &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), - "clfftEnqueueTransform failed" ); + // Create and initialize our timer class, if the external timer shared library loaded + baseStatTimer* timer = NULL; + size_t clFFTID = 0; + if( get_timer ) + { + timer = get_timer( CLFFT_GPU ); + timer->Reserve( 1, profile_count ); + timer->setNormalize( true ); - if( timer ) timer->Stop( clFFTID ); + clFFTID = timer->getUniqueID( "clFFT", 0 ); } - OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); - if(clMedBuffer) clReleaseMemObject(clMedBuffer); - double wtime = tr.Sample()/((double)profile_count); - size_t totalLen = 1; - for(int i=0; i 1) { + Timer tr; + tr.Start(); + for( cl_uint i = 0; i < profile_count; ++i ) + { + if( timer ) timer->Start( clFFTID ); + + OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent[i], + &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), + "clfftEnqueueTransform failed" ); + + if( timer ) timer->Stop( clFFTID ); + } + OPENCL_V_THROW( clWaitForEvents ( profile_count, outEvent ), "clWaitForEvents failed" ); + + double wtime = tr.Sample()/((double)profile_count); + + OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); + + size_t totalLen = 1; + for(int i=0; i initializeCL( cl_device_type deviceType, int cleanupCL( cl_context* context, cl_command_queue* commandQueue, const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent ) { - if( *outEvent != NULL ) - OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" ); + if(outEvent != NULL) + { + if( *outEvent != NULL ) + OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" ); + } releaseOpenCLMemBuffer( numBuffersIn, inputBuffer); releaseOpenCLMemBuffer( numBuffersOut, outputBuffer); diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp index 93d073d3..4161279b 100644 --- a/src/library/generator.stockham.cpp +++ b/src/library/generator.stockham.cpp @@ -821,6 +821,150 @@ namespace StockhamGenerator return; } + // block to rearrange reads of adjacent memory locations together + if(linearRegs && (flag == SR_READ)) + { + for(size_t r=0; r (radix/2))) + break; + + if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i != 0)) + break; + + if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0)) + passStr += "\n\t}\n\tif( rw && !me)\n\t{"; + + for(size_t c=cStart; c algLS ) + { + passStr += "(("; passStr += SztToStr(numButterfly); + passStr += "*me + "; passStr += SztToStr(butterflyIndex); passStr += ")/"; + passStr += SztToStr(algLS); passStr += ")*"; passStr += SztToStr(algL); passStr += " + ("; + passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex); + passStr += ")%"; passStr += SztToStr(algLS); passStr += " + "; + } + else + { + passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex); + passStr += " + "; + } + + passStr += SztToStr(r*algLS); passStr += " )*"; passStr += SztToStr(stride); passStr += "]"; + passStr += tail; passStr += " = "; passStr += regIndex; + if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); } + passStr += ";"; + + // Since we write real & imag at once, we break the loop + if(interleaved && (component == SR_COMP_BOTH)) + break; + } + + if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0)) + passStr += "\n\t}\n\tif(rw)\n\t{"; + + butterflyIndex++; + } + } + + return; + } + + for(size_t i=0; iinStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc - && (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) ) + && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) ) { fftPlan->blockCompute = true; @@ -607,7 +607,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break; if ( IsPo2(fftPlan->length[0]) - && (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) ) break; + && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) ) break; if ( clLengths[0]<=32 && clLengths[1]<=32) break; diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp index 269378ec..9cce59bf 100644 --- a/src/statTimer/statisticalTimer.GPU.cpp +++ b/src/statTimer/statisticalTimer.GPU.cpp @@ -176,6 +176,7 @@ GpuStatTimer::Reset( ) if( nEvents == 0 || nSamples == 0 ) throw std::runtime_error( "StatisticalTimer::Reserve( ) was not called before Reset( )" ); + ReleaseEvents(); Reserve( nEvents, nSamples ); return; @@ -203,9 +204,17 @@ void GpuStatTimer::AddSample( clfftPlanHandle plHandle, FFTPlan* plan, cl_kernel kern, cl_uint numEvents, cl_event* ev, const std::vector< size_t >& gWorkSize ) { + if( (numEvents != 0) && (ev == NULL) ) + return; + if( timerData.empty( ) ) return; + for( size_t i = 0; i < numEvents; ++i ) + { + ::clRetainEvent(ev[i]); + } + if( currRecord == 0 ) { timerData.at( currID ).push_back( StatDataVec( ) ); @@ -242,6 +251,26 @@ GpuStatTimer::getUniqueID( const std::string& label, cl_uint groupID ) } +void GpuStatTimer::ReleaseEvents() +{ + for( cl_uint id = 0; id < labelID.size( ); ++id ) + { + for( size_t s = 0; s < timerData.at( id ).size( ); ++s ) + { + for( size_t n = 0; n < timerData.at( id ).at( s ).size( ); ++n ) + { + StatData& sd = timerData[ id ][ s ][ n ]; + + for( size_t i = 0; i < sd.outEvents.size( ); ++i ) + { + ::clReleaseEvent(sd.outEvents[ i ]); + } + + } + } + } +} + void GpuStatTimer::queryOpenCL( size_t id ) { for( size_t s = 0; s < timerData.at( id ).size( ); ++s ) @@ -448,7 +477,9 @@ GpuStatTimer::Print( ) mean[ m ].plHandle == mean[ t ].planZ || mean[ m ].plHandle == mean[ t ].planTX || mean[ m ].plHandle == mean[ t ].planTY || - mean[ m ].plHandle == mean[ t ].planTZ ) + mean[ m ].plHandle == mean[ t ].planTZ || + mean[ m ].plHandle == mean[ t ].planRCcopy || + mean[ m ].plHandle == mean[ t ].planCopy ) { time += mean[ m ].doubleNanoSec; } @@ -471,7 +502,8 @@ GpuStatTimer::Print( ) } if( ( mean[ t ].planX + mean[ t ].planY + mean[ t ].planZ ) > 0 || - ( mean[ t ].planTX + mean[ t ].planTY + mean[ t ].planTZ ) > 0 ) + ( mean[ t ].planTX + mean[ t ].planTY + mean[ t ].planTZ ) > 0 || + ( mean[ t ].planRCcopy + mean[ t ].planCopy ) > 0 ) { tout << std::setw( tableFourth ) << _T( "Child Handles:" ); catLengths.str( _T( "" ) ); @@ -503,6 +535,16 @@ GpuStatTimer::Print( ) catLengths << _T( "," ); catLengths << mean[ t ].planTZ; } + if( mean[ t ].planRCcopy != 0 ) + { + catLengths << _T( "," ); + catLengths << mean[ t ].planRCcopy; + } + if( mean[ t ].planCopy != 0 ) + { + catLengths << _T( "," ); + catLengths << mean[ t ].planCopy; + } catLengths << _T( ")" ); tout << std::setw( tableThird ) << catLengths.str( ) << std::endl; } diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h index 62e3c29a..d52e7ddc 100644 --- a/src/statTimer/statisticalTimer.GPU.h +++ b/src/statTimer/statisticalTimer.GPU.h @@ -48,6 +48,9 @@ struct StatData clfftPlanHandle planTY; clfftPlanHandle planTZ; + clfftPlanHandle planRCcopy; + clfftPlanHandle planCopy; + std::vector< size_t > lengths; std::vector< size_t > inStride; std::vector< size_t > outStride; @@ -62,6 +65,7 @@ struct StatData deltaNanoSec( 0 ), kernel( kern ), batchSize( plan->batchsize ), dim( plan->dim ), plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ), planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ), + planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ), inStride( plan->inStride ), outStride( plan->outStride ), lengths( plan->length ), enqueueWorkSize( gWorkSize ) { @@ -182,6 +186,8 @@ class GpuStatTimer : public baseStatTimer void queryOpenCL( size_t id ); + void ReleaseEvents(); + public: /** * \fn getInstance() diff --git a/src/tests/accuracy_test_pow3.cpp b/src/tests/accuracy_test_pow3.cpp index 844e2158..cf7777e8 100644 --- a/src/tests/accuracy_test_pow3.cpp +++ b/src/tests/accuracy_test_pow3.cpp @@ -48,7 +48,7 @@ class accuracy_test_pow3_double : public ::testing::Test { } }; -namespace power2 +namespace power3 { // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // // ^^^^^^^^^^^^^^^^^^^^^^^ normal 1D ^^^^^^^^^^^^^^^^^^^^^^ // diff --git a/src/tests/accuracy_test_pow5.cpp b/src/tests/accuracy_test_pow5.cpp index c73f5244..0861c763 100644 --- a/src/tests/accuracy_test_pow5.cpp +++ b/src/tests/accuracy_test_pow5.cpp @@ -48,7 +48,7 @@ class accuracy_test_pow5_double : public ::testing::Test { } }; -namespace power2 +namespace power5 { // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // // ^^^^^^^^^^^^^^^^^^^^^^^ normal 1D ^^^^^^^^^^^^^^^^^^^^^^ //