353
社区成员
发帖
与我相关
我的任务
分享
__global__ void Laplace_d (float *A, float *B, int *N_t){
int tx = threadIdx.x; int ty = threadIdx.y;
int j = blockIdx.x * blockDim.x + tx;
int i = blockIdx.y * blockDim.y + ty;
int index, left, right, top, bottom;
int N=*N_t;
index = i*N +j;
left = i*N+ j-1;
right = i*N+ j+1;
top = (i-1)*N +j;
bottom = (i+1)*N+j;
if(i>0 && i<N-1 && j>0 && j<N-1){
B[index]=0.25*( A[left]+A[right]+A[top]+A[bottom])*0.9+0.1*B[index];
}
}
texture < float, 1, cudaReadModeElementType> coalesed;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
__global__
void Laplace_d(float* D,int* N)
{
int tidx=blockDim.x*blockIdx.x+threadIdx.x;
int tidy=blockDim.y*blockIdx.y+threadIdx.y;
const int pitch=*N;
const int gloc=pitch*tidy+tidx;
__shared__ float smem[BLOCK_SIZE+2][BLOCK_SIZE+2];
int sidx=threadIdx.x+1;
int sidy=threadIdx.y+1;
smem[sidy][sidx]=tex1Dfetch(coalesed,gloc);
if(threadIdx.x==0){
smem[sidy][0 ]=tex1Dfetch(coalesed,gloc-1 ); //不需要担心索引越界,因为线型纹理在索引超出范围后会返回0,对结果没有影响
smem[sidy][BLOCK_SIZE+1]=tex1Dfetch(coalesed,gloc+BLOCK_SIZE);
}
if(threadIdx.y==0){
smem[0][sidx]=tex1Dfetch(coalesed,gloc-pitch);
smem[BLOCK_SIZE+1][sidx]=tex1Dfetch(coalesed,gloc+__mul24(pitch,BLOCK_SIZE));
}
__syncthreads();
if(tidx > 0 && tidx < pitch-1 && tidy > 0 && tidy < pitch -1){//之前的代码忘记了边界情况,后来改正了
D[gloc]=0.25*0.9*(smem[sidy][sidx-1]+smem[sidy][sidx+1]+smem[sidy-1][sidx]+smem[sidy+1][sidx])+0.1*smem[sidy][sidx];
}
}
while (k<ITERATIONS){
cudaBindTexture(0,&coalesed, device2,&channelDesc,size);
Laplace_d<<<dimGrid, dimBlock>>>(device1,n);
cudaUnbindTexture(&coalesed);
cudaBindTexture(0,&coalesed, device1,&channelDesc,size);
Laplace_d<<<dimGrid, dimBlock>>>(device2,n);
cudaUnbindTexture(&coalesed);
k+=2;
}
while(k<ITERATIONS){
bind_texture(tex,data[SRC]);
laplace_solver(data[DST],N);
++k;
}
#define DIMX 16
#define DIMY 16
//用纹理避免边界元素的非合并访问
//B :D
//A :绑定到纹理coalesed
//N_t:F
__global__
void laplace(float* D,const float* F,const float weight)
{
int tidx=__mul24(blockDim.x,blockIdx.x)+threadIdx.x;
int tidy=__mul24(blockDim.y,blockIdx.y)+threadIdx.y;
const int pitch=__mul24(gridDim.x,blockDim.x);
const int gloc=__mul24(pitch,tidy)+tidx;
__shared__ float smem[DIMY+2][DIMX+2];
tidx=threadIdx.x+1;
tidy=threadIdx.y+1;
smem[tidy][tidx]=tex1Dfetch(coalesed,gloc);
if(threadIdx.x==0){
smem[tidy][0 ]=tex1Dfetch(coalesed,gloc-1 ); //不需要担心索引越界,因为线型纹理在索引超出范围后会返回0,对结果没有影响
smem[tidy][DIMX+1]=tex1Dfetch(coelesed,gloc+DIM);
}
if(threadIdx.y==0){
smem[0 ][tidx]=tex1Dfetch(coelesed,gloc-pitch);
smem[DIMY+1][tidx]=tex1Dfetch(coalesed,gloc+__mul24(pitch,DIMY));
} __syncthreads();
D[gloc]=0.25f*weight*(smem[tidy][tidx-1]+smem[tidy][tidx+1]+smem[tidy-1][tidx]+smem[tidy+1][tidx])+(1.f-weight)*F[gloc];
}
while (k<ITERATIONS){
Laplace_d<<<dimGrid, dimBlock>>>(device1, device2,n);
Laplace_d<<<dimGrid, dimBlock>>>(device2, device1,n);
k+=2;
}
while (k<ITERATIONS){
Laplace_h(cpu_data2, cpu_data,N);
Laplace_h(cpu_data, cpu_data2,N);
k+=2;
}
//I did a new version and this one works, so any comments are welcome.
__global__ void Laplace_d (float *A, float *B, int *N_t){
int tx = threadIdx.x; int ty = threadIdx.y;
int j = blockIdx.x * blockDim.x + tx;
int i = blockIdx.y * blockDim.y + ty;
int index;
int N=*N_t;
index = i*N +j;
//read data from global memory to shared memory
__shared__ float sdata[BLOCK_SIZE][BLOCK_SIZE];
sdata[ty][tx] = A[index];
__syncthreads();
if(i>0 && i<N-1 && j>0 && j<N-1){
float left = 0, right =0, top = 0, bottom =0;
//top
if(ty > 0)
top= sdata[ty-1][tx];
else top = A[index-N];
//left
if(tx > 0)
left = sdata[ty][tx-1];
else left = A[index-1];
//right
if(tx < BLOCK_SIZE-1)
right = sdata[ty][tx+1];
else right = A[index+1];
//bottom
if(ty < BLOCK_SIZE-1)
bottom = sdata[ty+1][tx];
else bottom = A[index+N];
B[index]=0.25*( left+right+top+bottom)*0.9+0.1*sdata[ty][tx];
}
}