看不到bank conflict,求指教
土豆南瓜粥 2014-04-01 08:11:05 #include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>
#include <stdio.h>
#include <assert.h>
//void checkCUDAError(const char *msg);
#define TILE_DIM 16
__global__ void transposeCoalesced(float *idata, float *odata,int width, int height)
{
// __shared__ float tile[TILE_DIM][TILE_DIM+1];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + width * yIndex;
int index_out = yIndex + height * xIndex;
odata[index_out]=idata[index_in];
//odata[index_out]=2;
/* xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
tile[threadIdx.y][threadIdx.x] = idata[index_in];
__syncthreads();
odata[index_out] = tile[threadIdx.x][threadIdx.y];
*/
}
/**
* Host function that prepares data array and passes it to the CUDA kernel.
*/
int main(void)
{
// pointer for host memory
float *odata, * h_a, * h_b;
float *idata;
int width;
int height;
printf("please enter the width of A\n");
scanf("%d", &width);
printf("please enter the height of A\n");
scanf("%d", &height);
h_a = (float *)malloc(sizeof(float)*width*height);
h_b = (float *)malloc(sizeof(float)*height*width);
for(int n=0;n<height;n++)
for(int i=width*n;i<width*(n+1);i++)
h_a[i]=n;
for(int i=0;i<height*width;i++)
h_b[i]=0;
// Part 1 of 5: allocate host and device memor
cudaMalloc((void **)&idata , sizeof(float)*width*height);
cudaMalloc((void **)&odata , sizeof(float)*height*width);
cudaMemcpy(idata,h_a,sizeof(float)*width*height,cudaMemcpyHostToDevice);
cudaMemcpy(odata,h_b,sizeof(float)*height*width,cudaMemcpyHostToDevice);
dim3 dimGrid(width / TILE_DIM,height / TILE_DIM);
dim3 dimBlock(TILE_DIM, TILE_DIM);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
transposeCoalesced<<< dimGrid,dimBlock>>>( idata,odata,width,height );
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaThreadSynchronize();
cudaMemcpy(h_b,odata,sizeof(float)*height*width,cudaMemcpyDeviceToHost);
// Part 5 of 5: verify the data returned to the host is correct
for(int i=0;i<height*width;i++)
{
printf("%f\t", h_b[i]);
}
printf("%f\n", elapsedTime);
cudaFree(idata);
cudaFree(odata);;
// free host memory
free(h_a);
free(h_b);
system("pause");
return 0;
}