使用CUDA计算直方图:核函数以二维线程块、二维线程格的方式启动,但目前只能处理宽和高均为16的倍数的数据。
#include "../common/book.h"
#define M 10240
#define N 10240
#define SIZE (M*N)
// Report the most recent CUDA runtime error, if any, and terminate.
//
// msg: caller-supplied context that is printed in front of the CUDA error text.
//
// When cudaGetLastError() reports cudaSuccess the function returns without
// doing anything. Otherwise it prints the message to stderr, waits for a
// character on stdin (so a console window stays open) and exits with -1.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    getchar();
    exit(-1);
}
// Builds a 256-bin histogram of the bytes in `buffer`.
//
// Launch layout: a 2D grid of 2D blocks that covers at least M columns (x)
// and N rows (y); one thread handles one byte. The grid may overshoot the
// data in either dimension -- surplus threads are masked out.
//
// buffer: device pointer to M*N bytes stored row by row, M bytes per row.
// histo:  device pointer to 256 counters; must be zeroed before the launch.
//         Each block adds its partial counts to it atomically.
//
// Shared memory: 256 * sizeof(unsigned int) bytes, statically allocated.
__global__ void myhistKernel(unsigned char * buffer,unsigned int * histo)
{
    __shared__ unsigned int temp[256];

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Flat thread id inside the block and the block's total thread count.
    const int linear = threadIdx.x + threadIdx.y * blockDim.x;
    const int blockThreads = blockDim.x * blockDim.y;

    // Clear the per-block histogram. The strided loop covers all 256 bins
    // even when the block has fewer than 256 threads.
    for (int bin = linear; bin < 256; bin += blockThreads)
        temp[bin] = 0;
    __syncthreads();

    if (x < M && y < N) {
        // The row stride is the real data width M, NOT the launch width
        // blockDim.x * gridDim.x. Using the launch width mis-addresses every
        // row after the first and reads out of bounds whenever M is not a
        // multiple of blockDim.x. size_t avoids overflow for large inputs.
        const size_t offset = (size_t)y * M + x;
        atomicAdd(&temp[buffer[offset]], 1u);
    }
    // Every thread reaches this barrier, including the masked-out ones.
    __syncthreads();

    // Merge the block's partial histogram into the global one.
    for (int bin = linear; bin < 256; bin += blockThreads)
        atomicAdd(&histo[bin], temp[bin]);
}
// Host driver: fills a SIZE-byte buffer via big_random_block(), computes
// its 256-bin histogram on the GPU with myhistKernel, prints the elapsed
// time and the histogram sum, then verifies the result against the CPU.
int main( void ) {
    unsigned char *buffer = (unsigned char*)big_random_block( SIZE );

    // The timer starts before the device allocation and the host-to-device
    // copy, so the reported time covers all GPU-side work including the
    // transfers, not just the kernel.
    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // Allocate device memory for the input bytes and the histogram.
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    const size_t histoBytes = 256 * sizeof( unsigned int );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo, histoBytes ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0, histoBytes ) );

    // Ceil-divide so the grid covers M columns by N rows even when they are
    // not multiples of the block size; the kernel masks the surplus threads.
    dim3 dimBlock(16,16);
    dim3 dimGrid((M+dimBlock.x-1)/dimBlock.x,
                 (N+dimBlock.y-1)/dimBlock.y);
    myhistKernel<<<dimGrid,dimBlock>>>(dev_buffer,dev_histo);
    // Launch-configuration errors show up via cudaGetLastError() (inside
    // checkCUDAError); faults raised while the kernel runs are returned by
    // the synchronizing call.
    checkCUDAError("kernel execution");
    HANDLE_ERROR( cudaDeviceSynchronize() );

    unsigned int histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo, histoBytes,
                              cudaMemcpyDeviceToHost ) );

    // Get the stop time and display the timing results.
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time to generate: %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum: %ld\n", histoCount );

    // Verify against the CPU: subtract one per input byte from its bin;
    // every bin must end at exactly zero if the GPU counts were right.
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0)
            printf( "Failure at %d!\n", i );
    }

    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    HANDLE_ERROR( cudaFree( dev_histo ) );
    HANDLE_ERROR( cudaFree( dev_buffer ) );
    free( buffer );
    // Wait for a key press so a console window stays open.
    getchar();
    return 0;
}
如果数据的宽或高不是16的倍数,访问buffer[offset]时会发生越界(内存异常)。请问有什么方法可以解决这个问题?谢谢!