580
社区成员
发帖
与我相关
我的任务
分享
// Build-time configuration for the kernel that follows.
// __GPU_ARCH__ selects the target; it is compared against the
// __GPU_arch_sm_NN__ constants below (presumably compute capability
// major*10+minor -- confirm against the build setup).
#define __GPU_ARCH__ 10
#define __GPU_arch_sm_10__ 10
#define __GPU_arch_sm_11__ 11
#define __GPU_arch_sm_12__ 12
#define __GPU_arch_sm_13__ 13
// architecture(13) token-pastes to __GPU_arch_sm_13__, i.e. 13.
#define architecture(s) __GPU_arch_sm_##s##__
// IMUL: unsigned multiply. Below sm_13 it maps to the 24-bit intrinsic
// __umul24 (only the low 24 bits of each operand take part); otherwise it
// is a plain multiplication.
// Fixed: the condition used to read `archtecture(13)`, an undefined name,
// which made this #if ill-formed.
#if __GPU_ARCH__<architecture(13)
#define IMUL(x,y) __umul24(x,y)
#else
#define IMUL(x,y) ((x)*(y))
#endif
// MAXSAMPLE: element count of the kernel's shared-memory buffer and the
// trip count of its per-block loop.
#if __GPU_ARCH__<architecture(13)
#define MAXSAMPLE 768
#else
#define MAXSAMPLE 1024
#endif
// NOTE(review): I0/I1 expand to the constants 0 and !0, so SWAP() expands to
// `(0=!0)` -- an assignment to a literal. Any use of SWAP() fails to compile;
// the macros are kept unchanged only so existing references still resolve.
#define I0 0
#define I1 !I0
#define SWAP() (I0=I1)
// Defined further down in this file; declared here so the kernel can call it.
__device__ float distance(const float4& v0,const float4& v1);

/*
 * For each block: walks MAXSAMPLE consecutive particles starting at
 * particle[blockId * threadsPerBlock], and for every particle reduces the
 * distances to the block's reference points (one per thread, sample[sloc])
 * to their minimum in shared memory. The largest of those per-particle
 * minima is written to dis[blockId].
 *
 * Preconditions visible from the code:
 *   - blockDim.x*blockDim.y must not exceed MAXSAMPLE (smem is indexed by
 *     the flat thread index) and must be a power of two, otherwise the
 *     halving reduction skips elements.
 *   - sample must hold at least blockDim.x*blockDim.y entries.
 *   - particle must be readable up to blockId*threadsPerBlock + MAXSAMPLE - 1.
 *     NOTE(review): there is no length parameter, so this cannot be checked
 *     here; consecutive blocks also overlap whenever threadsPerBlock is
 *     smaller than MAXSAMPLE -- confirm this is intended.
 *
 * Fixes relative to the previous version:
 *   - `area` (undeclared) replaced by the block area `barea`;
 *   - the reduction stride is re-initialised for every particle (it used to
 *     be left at 0 after the first one);
 *   - a barrier separates the end of one reduction from the next
 *     iteration's writes to smem;
 *   - the I0/I1/SWAP() ping-pong, which expanded to an assignment to a
 *     literal, is replaced by a single running value;
 *   - only the thread with flat index 0 writes the result (the old test
 *     `!(threadIdx.x&threadIdx.y)` matched many threads);
 *   - unused locals removed.
 */
__global__ void findNearestRefPoints(float* dis,float4* particle,float4* sample)
{
    __shared__ float smem[MAXSAMPLE];

    // Flat thread index inside the block; doubles as this thread's smem slot.
    unsigned int sloc=IMUL(blockDim.x,threadIdx.y)+threadIdx.x;
    // Number of threads per block.
    unsigned int barea=IMUL(blockDim.x,blockDim.y);
    // Flat block index; selects both the particle window and the output slot.
    unsigned int gloc=IMUL(gridDim.x,blockIdx.y)+blockIdx.x;

    float4 ref=sample[sloc];

    // Running result, only maintained by thread 0. It starts at 0 and keeps
    // the larger value, mirroring the original initialiser and `<` comparison
    // (presumably a max-of-min distance -- confirm with the author).
    float best=.0f;

    for(unsigned int i=0;i<MAXSAMPLE;i++)
    {
        float4 src=particle[gloc*barea+i];
        smem[sloc]=distance(src,ref);

        // Tree reduction to the minimum; the barrier at the top of each step
        // orders the previous step's writes before this step's reads.
        for(unsigned int offset=barea>>1;offset>0;offset>>=1){
            __syncthreads();
            if(sloc<offset){
                if(smem[sloc]>smem[sloc+offset]){
                    smem[sloc]=smem[sloc+offset];
                }
            }
        }

        // All reduction reads must finish before any thread overwrites its
        // slot in the next iteration.
        __syncthreads();

        // Thread 0 produced smem[0] itself and is the only one to overwrite
        // it next, so no further barrier is needed for this read.
        if(sloc==0){
            float nearest=smem[0];
            if(best<nearest){
                best=nearest;
            }
        }
    }

    if(sloc==0){
        dis[gloc]=best;
    }
}
// Build-time configuration for the distance helper and kernel that follow.
// __GPU_ARCH__ selects the target; it is compared against the
// __GPU_arch_sm_NN__ constants below (presumably compute capability
// major*10+minor -- confirm against the build setup).
#define __GPU_ARCH__ 10
#define __GPU_arch_sm_10__ 10
#define __GPU_arch_sm_11__ 11
#define __GPU_arch_sm_12__ 12
#define __GPU_arch_sm_13__ 13
// architecture(13) token-pastes to __GPU_arch_sm_13__, i.e. 13.
#define architecture(s) __GPU_arch_sm_##s##__
// IMUL: unsigned multiply. Below sm_13 it maps to the 24-bit intrinsic
// __umul24 (only the low 24 bits of each operand take part); otherwise it
// is a plain multiplication.
// Fixed: the condition used to read `archtecture(13)`, an undefined name,
// which made this #if ill-formed.
#if __GPU_ARCH__<architecture(13)
#define IMUL(x,y) __umul24(x,y)
#else
#define IMUL(x,y) ((x)*(y))
#endif
// MAXSAMPLE: element count of the kernel's shared-memory buffer.
// An earlier definition with a different value may already be in effect in
// this file, so drop it first to avoid an incompatible macro redefinition.
#undef MAXSAMPLE
#if __GPU_ARCH__<architecture(13)
#define MAXSAMPLE 512
#else
#define MAXSAMPLE 768
#endif
/*
 * Euclidean distance between the xyz components of two float4 values.
 * The w component is not read.
 *
 * Fixed: the z term used to be computed as v0.z-v0.z (always 0), so the
 * result ignored the z axis entirely.
 */
__device__ float distance(const float4& v0,const float4& v1)
{
    float dx=v0.x-v1.x;
    float dy=v0.y-v1.y;
    float dz=v0.z-v1.z;
    return sqrtf(dx*dx+dy*dy+dz*dz);
}
/*
 * Each thread computes the distance between one particle (picked by its
 * global 2D position, particle[gloc]) and one reference point (picked by its
 * position inside the block, sample[sloc]). The block then reduces those
 * distances to their minimum in shared memory and writes it to
 * dis[blockId], where blockId = gridDim.x*blockIdx.y + blockIdx.x.
 *
 * Preconditions visible from the code:
 *   - blockDim.x*blockDim.y must be a power of two (the halving reduction
 *     skips elements otherwise) and must not exceed MAXSAMPLE, the size of
 *     the shared buffer.
 *   - particle must cover every thread of the grid and sample every thread
 *     of a block; there are no bounds checks.
 *
 * Fixed: the result used to be written by every thread for which
 * (threadIdx.x & threadIdx.y) == 0. That matches many threads, and they read
 * smem[0] with no barrier after the last reduction step. Now only the thread
 * with flat index 0 -- the one that produced the final smem[0] -- writes it.
 */
__global__ void findNearestRefPoints(float* dis,float4* particle,float4* sample)
{
    unsigned int tidx=IMUL(blockDim.x,blockIdx.x)+threadIdx.x;
    unsigned int tidy=IMUL(blockDim.y,blockIdx.y)+threadIdx.y;
    // Row-major flat index of this thread over the whole grid.
    unsigned int gloc=IMUL(IMUL(gridDim.x,blockDim.x),tidy)+tidx;
    __shared__ float smem[MAXSAMPLE];
    float4 src=particle[gloc];
    // Flat thread index inside the block; doubles as this thread's smem slot.
    unsigned int sloc=IMUL(blockDim.x,threadIdx.y)+threadIdx.x;
    float4 ref=sample[sloc];
    smem[sloc]=distance(src,ref);

    // Tree reduction to the minimum. The block size must be a power of two.
    // The barrier at the top of each step orders the previous writes (the
    // initial store above included) before this step's reads.
    for(unsigned int offset=IMUL(blockDim.x,blockDim.y)>>1;offset>0;offset>>=1){
        __syncthreads();
        if(sloc<offset){
            if(smem[sloc]>smem[sloc+offset]){
                // The overwritten value is never needed again.
                smem[sloc]=smem[sloc+offset];
            }
        }
    }

    // Thread 0 performed the last update of smem[0] itself, so it can read
    // it back without another barrier.
    if(sloc==0){
        gloc=IMUL(gridDim.x,blockIdx.y)+blockIdx.x;
        dis[gloc]=smem[0];
    }
}