From bb880d95af73927f3b7a692785ffacaeb73fb398 Mon Sep 17 00:00:00 2001 From: bnataraj Date: Fri, 31 Jul 2015 01:35:47 -0500 Subject: [PATCH 1/9] changes to fix ECC on performance issue --- src/library/generator.stockham.cpp | 72 +++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp index 93d073d3..d145234e 100644 --- a/src/library/generator.stockham.cpp +++ b/src/library/generator.stockham.cpp @@ -821,6 +821,62 @@ namespace StockhamGenerator return; } + // block to rearrange reads of adjacent memory locations together + if(linearRegs && (flag == SR_READ)) + { + for(size_t r=0; r Date: Wed, 5 Aug 2015 15:53:40 -0500 Subject: [PATCH 2/9] fixing ECC issue in inverse C2R transform; pow2 only --- src/library/generator.stockham.cpp | 95 ++++++++++++++++++++++++++++++ src/tests/accuracy_test_pow3.cpp | 2 +- src/tests/accuracy_test_pow5.cpp | 2 +- 3 files changed, 97 insertions(+), 2 deletions(-) diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp index d145234e..4f39e0c7 100644 --- a/src/library/generator.stockham.cpp +++ b/src/library/generator.stockham.cpp @@ -877,6 +877,94 @@ namespace StockhamGenerator } return; } + + // block to rearrange writes of adjacent memory locations together + if(linearRegs && (flag == SR_WRITE) && (nextPass == NULL)) + { + for(size_t r=0; r (radix/2))) + break; + + if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i != 0)) + break; + + if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0)) + passStr += "\n\t}\n\tif( rw && !me)\n\t{"; + + for(size_t c=cStart; c algLS ) + { + passStr += "(("; passStr += SztToStr(numButterfly); + passStr += "*me + "; passStr += SztToStr(butterflyIndex); passStr += ")/"; + passStr += SztToStr(algLS); passStr += ")*"; passStr += SztToStr(algL); passStr += " + ("; + passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += 
SztToStr(butterflyIndex); + passStr += ")%"; passStr += SztToStr(algLS); passStr += " + "; + } + else + { + passStr += SztToStr(numButterfly); passStr += "*me + "; passStr += SztToStr(butterflyIndex); + passStr += " + "; + } + + passStr += SztToStr(r*algLS); passStr += " )*"; passStr += SztToStr(stride); passStr += "]"; + passStr += tail; passStr += " = "; passStr += regIndex; + if(scale != 1.0f) { passStr += " * "; passStr += FloatToStr(scale); passStr += FloatSuffix(); } + passStr += ";"; + + // Since we write real & imag at once, we break the loop + if(interleaved && (component == SR_COMP_BOTH)) + break; + } + + if(realSpecial && (nextPass == NULL) && (r == radix/2) && (i == 0)) + passStr += "\n\t}\n\tif(rw)\n\t{"; + + butterflyIndex++; + } + } + + return; + } + + for(size_t i=0; i Date: Wed, 5 Aug 2015 19:42:06 -0500 Subject: [PATCH 3/9] fixing event leak issues that caused seg fault of client with 14.502 driver, this problem did not affect standalone library usage --- src/client/client.cpp | 26 ++++++++++++++--------- src/client/openCL.misc.cpp | 7 +++++-- src/statTimer/statisticalTimer.GPU.cpp | 29 ++++++++++++++++++++++++++ src/statTimer/statisticalTimer.GPU.h | 2 ++ 4 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/client/client.cpp b/src/client/client.cpp index df004059..1dfb5ed5 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -73,7 +73,6 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride std::vector< cl_device_id > device_id; cl_context context; cl_command_queue queue; - cl_event outEvent = NULL; clfftPlanHandle plan_handle; for (unsigned u = 0; u < max_dimensions; ++u) { @@ -204,7 +203,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } @@ -252,10 
+251,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -289,7 +288,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -325,10 +324,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -373,7 +372,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], - 0, NULL, &outEvent ), + 0, NULL, NULL ), "clEnqueueWriteBuffer failed" ); } break; @@ -518,11 +517,13 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride Timer tr; tr.Start(); + cl_event *outEvent = new cl_event[profile_count]; + for( cl_uint i = 0; i < profile_count; ++i ) { if( timer ) timer->Start( clFFTID ); - OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, 
&outEvent, + OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent[i], &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), "clfftEnqueueTransform failed" ); @@ -553,6 +554,11 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride /*****************/ FreeSharedLibrary( timerLibHandle ); + for( cl_uint i = 0; i < profile_count; ++i ) + clReleaseEvent(outEvent[i]); + + delete[] outEvent; + // Read and check output data // This check is not valid if the FFT is executed multiple times inplace. // @@ -725,7 +731,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" ); OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" ); - cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent ); + cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, NULL ); return 0; } diff --git a/src/client/openCL.misc.cpp b/src/client/openCL.misc.cpp index cb5db296..21d4cbc2 100644 --- a/src/client/openCL.misc.cpp +++ b/src/client/openCL.misc.cpp @@ -477,8 +477,11 @@ std::vector< cl_device_id > initializeCL( cl_device_type deviceType, int cleanupCL( cl_context* context, cl_command_queue* commandQueue, const cl_uint numBuffersIn, cl_mem inputBuffer[], const cl_uint numBuffersOut, cl_mem outputBuffer[], cl_event* outEvent ) { - if( *outEvent != NULL ) - OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" ); + if(outEvent != NULL) + { + if( *outEvent != NULL ) + OPENCL_V_THROW( clReleaseEvent( *outEvent ), "Error: In clReleaseEvent\n" ); + } releaseOpenCLMemBuffer( numBuffersIn, inputBuffer); releaseOpenCLMemBuffer( numBuffersOut, outputBuffer); diff --git a/src/statTimer/statisticalTimer.GPU.cpp 
b/src/statTimer/statisticalTimer.GPU.cpp index 269378ec..bdd3bd45 100644 --- a/src/statTimer/statisticalTimer.GPU.cpp +++ b/src/statTimer/statisticalTimer.GPU.cpp @@ -176,6 +176,7 @@ GpuStatTimer::Reset( ) if( nEvents == 0 || nSamples == 0 ) throw std::runtime_error( "StatisticalTimer::Reserve( ) was not called before Reset( )" ); + ReleaseEvents(); Reserve( nEvents, nSamples ); return; @@ -203,9 +204,17 @@ void GpuStatTimer::AddSample( clfftPlanHandle plHandle, FFTPlan* plan, cl_kernel kern, cl_uint numEvents, cl_event* ev, const std::vector< size_t >& gWorkSize ) { + if(ev == NULL) + return; + if( timerData.empty( ) ) return; + for( size_t i = 0; i < numEvents; ++i ) + { + ::clRetainEvent(ev[i]); + } + if( currRecord == 0 ) { timerData.at( currID ).push_back( StatDataVec( ) ); @@ -242,6 +251,26 @@ GpuStatTimer::getUniqueID( const std::string& label, cl_uint groupID ) } +void GpuStatTimer::ReleaseEvents() +{ + for( cl_uint id = 0; id < labelID.size( ); ++id ) + { + for( size_t s = 0; s < timerData.at( id ).size( ); ++s ) + { + for( size_t n = 0; n < timerData.at( id ).at( s ).size( ); ++n ) + { + StatData& sd = timerData[ id ][ s ][ n ]; + + for( size_t i = 0; i < sd.outEvents.size( ); ++i ) + { + ::clReleaseEvent(sd.outEvents[ i ]); + } + + } + } + } +} + void GpuStatTimer::queryOpenCL( size_t id ) { for( size_t s = 0; s < timerData.at( id ).size( ); ++s ) diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h index 62e3c29a..8a099561 100644 --- a/src/statTimer/statisticalTimer.GPU.h +++ b/src/statTimer/statisticalTimer.GPU.h @@ -182,6 +182,8 @@ class GpuStatTimer : public baseStatTimer void queryOpenCL( size_t id ); + void ReleaseEvents(); + public: /** * \fn getInstance() From cf1640316ae5afdde81d48af71e985ad08239ec6 Mon Sep 17 00:00:00 2001 From: bnataraj Date: Thu, 6 Aug 2015 15:04:04 -0500 Subject: [PATCH 4/9] minor fixes to event handling from previous checkin --- src/client/client.cpp | 6 +++++- 
src/statTimer/statisticalTimer.GPU.cpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/client/client.cpp b/src/client/client.cpp index 1dfb5ed5..16808223 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -521,6 +521,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride for( cl_uint i = 0; i < profile_count; ++i ) { + outEvent[i] = 0; if( timer ) timer->Start( clFFTID ); OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent[i], @@ -555,7 +556,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride FreeSharedLibrary( timerLibHandle ); for( cl_uint i = 0; i < profile_count; ++i ) - clReleaseEvent(outEvent[i]); + { + if(outEvent[i]) + clReleaseEvent(outEvent[i]); + } delete[] outEvent; diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp index bdd3bd45..31fe146c 100644 --- a/src/statTimer/statisticalTimer.GPU.cpp +++ b/src/statTimer/statisticalTimer.GPU.cpp @@ -204,7 +204,7 @@ void GpuStatTimer::AddSample( clfftPlanHandle plHandle, FFTPlan* plan, cl_kernel kern, cl_uint numEvents, cl_event* ev, const std::vector< size_t >& gWorkSize ) { - if(ev == NULL) + if( (numEvents != 0) && (ev == NULL) ) return; if( timerData.empty( ) ) From a67ea0aac3eb188eaa785c109f23b7e99b76bd5a Mon Sep 17 00:00:00 2001 From: bragadeesh Date: Thu, 6 Aug 2015 15:20:10 -0500 Subject: [PATCH 5/9] optimizing EnqueueTransform API timing measurement --- src/client/client.cpp | 47 ++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/client/client.cpp b/src/client/client.cpp index 16808223..795cacbd 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -514,36 +514,47 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride // cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? 
NULL : &output_cl_mem_buffers[ 0 ]; - Timer tr; - tr.Start(); + OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, NULL, + &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), + "clfftEnqueueTransform failed" ); + + OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); cl_event *outEvent = new cl_event[profile_count]; + for( cl_uint i = 0; i < profile_count; ++i ) outEvent[i] = 0; - for( cl_uint i = 0; i < profile_count; ++i ) + if(profile_count > 1) { - outEvent[i] = 0; - if( timer ) timer->Start( clFFTID ); + Timer tr; + tr.Start(); + for( cl_uint i = 0; i < profile_count; ++i ) + { + if( timer ) timer->Start( clFFTID ); - OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent[i], - &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), - "clfftEnqueueTransform failed" ); + OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent[i], + &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), + "clfftEnqueueTransform failed" ); - if( timer ) timer->Stop( clFFTID ); - } - OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); - if(clMedBuffer) clReleaseMemObject(clMedBuffer); + if( timer ) timer->Stop( clFFTID ); + } + OPENCL_V_THROW( clWaitForEvents ( profile_count, outEvent ), "clWaitForEvents failed" ); + + double wtime = tr.Sample()/((double)profile_count); + + OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); + + size_t totalLen = 1; + for(int i=0; i 1) - { tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl; tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl; + } + if(clMedBuffer) clReleaseMemObject(clMedBuffer); + if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) ) { // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result From 9cbda54df4da4177f1926865113696b121dba755 Mon Sep 17 00:00:00 2001 From: bragadeesh Date: 
Thu, 6 Aug 2015 15:55:12 -0500 Subject: [PATCH 6/9] fixing duplicate timing displays --- src/client/client.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/client/client.cpp b/src/client/client.cpp index 795cacbd..9be77d07 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -390,22 +390,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl; } - // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) ); - // Create and initialize our timer class, if the external timer shared library loaded - baseStatTimer* timer = NULL; - size_t clFFTID = 0; - if( get_timer ) - { - timer = get_timer( CLFFT_GPU ); - timer->Reserve( 1, profile_count ); - timer->setNormalize( true ); - - clFFTID = timer->getUniqueID( "clFFT", 0 ); - } OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" ); OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" ); @@ -510,15 +498,28 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride } } - // Loop as many times as the user specifies to average out the timings - // + cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? 
NULL : &output_cl_mem_buffers[ 0 ]; + // Execute once for basic functional test OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, NULL, &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), "clfftEnqueueTransform failed" ); OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); + + + // Create and initialize our timer class, if the external timer shared library loaded + baseStatTimer* timer = NULL; + size_t clFFTID = 0; + if( get_timer ) + { + timer = get_timer( CLFFT_GPU ); + timer->Reserve( 1, profile_count ); + timer->setNormalize( true ); + + clFFTID = timer->getUniqueID( "clFFT", 0 ); + } cl_event *outEvent = new cl_event[profile_count]; for( cl_uint i = 0; i < profile_count; ++i ) outEvent[i] = 0; From 6ee3759962afa1281413e6d7ab246d91afc11e44 Mon Sep 17 00:00:00 2001 From: bragadeesh Date: Thu, 6 Aug 2015 17:32:04 -0500 Subject: [PATCH 7/9] adding missing plans in timer --- src/statTimer/statisticalTimer.GPU.cpp | 17 +++++++++++++++-- src/statTimer/statisticalTimer.GPU.h | 4 ++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp index 31fe146c..9cce59bf 100644 --- a/src/statTimer/statisticalTimer.GPU.cpp +++ b/src/statTimer/statisticalTimer.GPU.cpp @@ -477,7 +477,9 @@ GpuStatTimer::Print( ) mean[ m ].plHandle == mean[ t ].planZ || mean[ m ].plHandle == mean[ t ].planTX || mean[ m ].plHandle == mean[ t ].planTY || - mean[ m ].plHandle == mean[ t ].planTZ ) + mean[ m ].plHandle == mean[ t ].planTZ || + mean[ m ].plHandle == mean[ t ].planRCcopy || + mean[ m ].plHandle == mean[ t ].planCopy ) { time += mean[ m ].doubleNanoSec; } @@ -500,7 +502,8 @@ GpuStatTimer::Print( ) } if( ( mean[ t ].planX + mean[ t ].planY + mean[ t ].planZ ) > 0 || - ( mean[ t ].planTX + mean[ t ].planTY + mean[ t ].planTZ ) > 0 ) + ( mean[ t ].planTX + mean[ t ].planTY + mean[ t ].planTZ ) > 0 || + ( mean[ t ].planRCcopy + mean[ t ].planCopy ) > 0 ) { tout << 
std::setw( tableFourth ) << _T( "Child Handles:" ); catLengths.str( _T( "" ) ); @@ -532,6 +535,16 @@ GpuStatTimer::Print( ) catLengths << _T( "," ); catLengths << mean[ t ].planTZ; } + if( mean[ t ].planRCcopy != 0 ) + { + catLengths << _T( "," ); + catLengths << mean[ t ].planRCcopy; + } + if( mean[ t ].planCopy != 0 ) + { + catLengths << _T( "," ); + catLengths << mean[ t ].planCopy; + } catLengths << _T( ")" ); tout << std::setw( tableThird ) << catLengths.str( ) << std::endl; } diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h index 8a099561..d52e7ddc 100644 --- a/src/statTimer/statisticalTimer.GPU.h +++ b/src/statTimer/statisticalTimer.GPU.h @@ -48,6 +48,9 @@ struct StatData clfftPlanHandle planTY; clfftPlanHandle planTZ; + clfftPlanHandle planRCcopy; + clfftPlanHandle planCopy; + std::vector< size_t > lengths; std::vector< size_t > inStride; std::vector< size_t > outStride; @@ -62,6 +65,7 @@ struct StatData deltaNanoSec( 0 ), kernel( kern ), batchSize( plan->batchsize ), dim( plan->dim ), plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ), planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ), + planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ), inStride( plan->inStride ), outStride( plan->outStride ), lengths( plan->length ), enqueueWorkSize( gWorkSize ) { From dc663b6a64b50394b6764e601e3a42bb18fba0d7 Mon Sep 17 00:00:00 2001 From: bragadeesh Date: Thu, 6 Aug 2015 19:06:32 -0500 Subject: [PATCH 8/9] changing algorithm for 2^19 and 2^20 sizes --- src/library/plan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/library/plan.cpp b/src/library/plan.cpp index 8a2b4ec9..084cf724 100644 --- a/src/library/plan.cpp +++ b/src/library/plan.cpp @@ -505,7 +505,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma { // Enable block compute under these conditions if( (fftPlan->inStride[0] == 1) && 
(fftPlan->outStride[0] == 1) && !rc - && (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) ) + && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) ) { fftPlan->blockCompute = true; @@ -607,7 +607,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break; if ( IsPo2(fftPlan->length[0]) - && (fftPlan->length[0] <= 1048576/PrecisionWidth(fftPlan->precision)) ) break; + && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) ) break; if ( clLengths[0]<=32 && clLengths[1]<=32) break; From 81be7cddefa8d5c049712fbcdfbe5a1cc09f63f0 Mon Sep 17 00:00:00 2001 From: bragadeesh Date: Fri, 7 Aug 2015 10:26:14 -0500 Subject: [PATCH 9/9] bug fix for length 1 --- src/library/generator.stockham.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp index 4f39e0c7..4161279b 100644 --- a/src/library/generator.stockham.cpp +++ b/src/library/generator.stockham.cpp @@ -1347,7 +1347,7 @@ namespace StockhamGenerator std::string oddpadd = oddp ? 
" (me/2) + " : " "; std::string idxStr, idxStrRev; - if((length == 2) || ((length & (length - 1)) != 0)) + if((length <= 2) || ((length & (length - 1)) != 0)) { idxStr += SztToStr(bid); idxStr += "*me +"; idxStr += oddpadd; idxStr += SztToStr(lid); } @@ -1468,7 +1468,7 @@ namespace StockhamGenerator if(fwd) { std::string idxStr, idxStrRev; - if((length == 2) || ((length & (length - 1)) != 0)) + if((length <= 2) || ((length & (length - 1)) != 0)) { idxStr += SztToStr(length/(2*workGroupSize)); idxStr += "*me +"; idxStr += oddpadd; idxStr += SztToStr(lid); } @@ -1541,7 +1541,7 @@ namespace StockhamGenerator else { std::string idxStr, idxStrRev; - if((length == 2) || ((length & (length - 1)) != 0)) + if((length <= 2) || ((length & (length - 1)) != 0)) { idxStr += SztToStr(bid); idxStr += "*me +"; idxStr += oddpadd; idxStr += SztToStr(lid); }