//block layout: (THREADS, 1, 1)
//grid layout:  (1024*32/THREADS, 1)
//launch with 2*THREADS*sizeof(float) bytes of dynamic shared memory
__global__
void kernel(float* out, const float* in)
{
    const uint tidx = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;
    //2*blockDim.x floats: [0, blockDim.x) holds at1, [blockDim.x, 2*blockDim.x) holds at0
    extern __shared__ float smem[];
    //load at0 from global memory into the upper half of shared memory
    smem[threadIdx.x + blockDim.x] = in[tidx];
    //compute each group's at1 within one warp
    //(__frand is not a standard CUDA intrinsic; it is assumed here to be a
    // project-specific device function declared elsewhere in the original code)
    smem[threadIdx.x] = smem[threadIdx.x + blockDim.x] + __frand(0);
    //note: there's no need to use __syncthreads(), because there is no data
    //dependence between any two warps; the computation in each warp depends
    //only on that warp's own 32 elements
    const uint laneid = threadIdx.x & 31u;
    const uint warpid = threadIdx.x >> 5;
    const uint stride = warpid << 5;
    //lanes 0..15 below read values written by lanes 16..31 above; pre-Volta
    //GPUs ran warps in lockstep, but on Volta and newer this intra-warp
    //dependence needs an explicit __syncwarp()
    __syncwarp();
    if (laneid < 16u) {
        //interleave the warp's 32 at1 values back into the at0 region
        smem[(laneid << 1) + stride + blockDim.x    ] = smem[laneid + stride      ];
        smem[(laneid << 1) + stride + blockDim.x + 1] = smem[laneid + stride + 16u];
    }
    //output the at1 values
    out[tidx] = smem[threadIdx.x];
}
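
For completeness, here is a minimal host-side launch sketch matching the layout comments above. This is an assumption, not part of the original post: THREADS = 256 is a hypothetical choice, N comes from the grid comment, and the third launch parameter sizes the dynamic shared memory at 2*THREADS floats (the at1 half plus the at0 half the kernel indexes).

#include <cuda_runtime.h>

int main()
{
    const int THREADS = 256;        //hypothetical block size
    const int N = 1024 * 32;        //total elements, from the grid comment
    const int BLOCKS = N / THREADS;

    float *d_in, *d_out;
    cudaMalloc(&d_in,  N * sizeof(float));
    cudaMalloc(&d_out, N * sizeof(float));

    //dynamic shared memory: 2*THREADS floats (at1 half + at0 half)
    kernel<<<BLOCKS, THREADS, 2 * THREADS * sizeof(float)>>>(d_out, d_in);
    cudaDeviceSynchronize();

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}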