353
社区成员
发帖
与我相关
我的任务
分享
原排列:
form<0>
x x x x .. x y y y y .. y
x x x x .. x y y y y .. y
x x x x .. x y y y y .. y
x x x x .. x y y y y .. y
. . . . .. . . . . . .. .
x x x x .. x y y y y .. y
z z z z .. z a a a a .. a
z z z z .. z a a a a .. a
z z z z .. z a a a a .. a
z z z z .. z a a a a .. a
. . . . .. . . . . . .. .
z z z z .. z a a a a .. a
还可以变化为下面2个排列:
form<1>
x x x x .. x y y y y .. y z z z z .. z a a a a .. a
x x x x .. x y y y y .. y z z z z .. z a a a a .. a
x x x x .. x y y y y .. y z z z z .. z a a a a .. a
x x x x .. x y y y y .. y z z z z .. z a a a a .. a
. . . . .. . . . . . .. . . . . . .. . . . . . .. .
x x x x .. x y y y y .. y z z z z .. z a a a a .. a
form<2>
x x x x x x x x x x x x x x x .. x
. . . . . . . . . . . . . . . .. .
x x . . . . . . . . . . . . x .. x
y y y y y y y y y y y y y y y .. y
. . . . . . . . . . . . . . . .. .
y y . . . . . . . . . . . . y .. y
z z z z z z z z z z z z z z z .. z
. . . . . . . . . . . . . . . .. .
z z . . . . . . . . . . . . z .. z
a a a a a a a a a a a a a a a .. a
. . . . . . . . . . . . . . . .. .
a a . . . . . . . . . . . . a .. a
如果这4个区域中,每个区域内的所有数都相同(即每个区域只含一个重复的值),则可以如下实施:
// Per-region constants, one per quadrant/stripe/band of the matrix;
// presumably filled from the host with cudaMemcpyToSymbol before launch:
//   const_cache[0] = x + 1
//   const_cache[1] = y + 2
//   const_cache[2] = z + 3
//   const_cache[3] = a + 4
// __constant__ memory is broadcast to a half-warp when all of its threads
// read the same address — the kernels below rely on that for speed.
__constant__ unsigned int const_cache[4];
对于form<0>:
// Fill dst, laid out as form<0> (2x2 quadrants x|y over z|a), with the
// per-quadrant constants in const_cache[0..3].
// Expects a 2D launch covering the whole matrix; the grid dimensions must
// be even in both axes so the halving below lands on the quadrant boundary.
__global__ void kernel_derisible_form_0(unsigned int* dst)
{
    unsigned int tidx  = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int tidy  = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int sizex = __umul24(gridDim.x, blockDim.x);
    // BUG FIX: the original read an undeclared `size` here; the linear
    // offset must use the FULL row width (sizex) before it is halved.
    unsigned int gloc  = __umul24(sizex, tidy) + tidx;
    sizex >>= 1;  // half-width: quadrant boundary in x
    unsigned int sizey = __umul24(gridDim.y, blockDim.y);
    sizey >>= 1;  // half-height: quadrant boundary in y
    // Quadrant id 0..3 = (bottom-half? << 1) | right-half?.
    dst[gloc] = const_cache[((tidy / sizey) << 1) + (tidx / sizex)];
}
对于form<1>:
// Fill dst, laid out as form<1> (four equal vertical stripes x|y|z|a),
// with the per-stripe constants held in const_cache[0..3].
// When the stripe width is a multiple of the half-warp size, every thread
// of a half-warp reads the same constant-cache address (a broadcast).
__global__ void kernel_derisible_form_1(unsigned int* dst)
{
    const unsigned int col   = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;
    const unsigned int row   = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
    const unsigned int width = __umul24(gridDim.x, blockDim.x);
    const unsigned int gloc  = __umul24(width, row) + col;
    // width>>2 is the stripe width, so col/(width>>2) selects stripe 0..3.
    dst[gloc] = const_cache[col / (width >> 2)];
}
对于form<2>:
// Fill dst, laid out as form<2> (four equal horizontal bands x/y/z/a),
// with the per-band constants held in const_cache[0..3].
// Expects a 2D launch covering the whole matrix, with the total height
// (gridDim.y*blockDim.y) divisible by 4.
__global__ void kernel_derisible_form_2(unsigned int* dst)
{
    unsigned int tidx = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int tidy = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int gloc = __umul24(__umul24(gridDim.x, blockDim.x), tidy) + tidx;
    // BUG FIX: the band index must use the GLOBAL row (tidy), not
    // threadIdx.y — threadIdx.y only identifies the row within one block,
    // so the original selected band 0 for every block in the grid.
    unsigned int cloc = tidy / (__umul24(gridDim.y, blockDim.y) >> 2);
    dst[gloc] = const_cache[cloc];
}
如果每个区域中的每个元素不一定相同,则可如下处理(事实上更通用了,但对于上面说的特殊情况,上面利用const cache的广播机制的方法可以更高效些):
// General form<0>: the four quadrants need not be uniform, so instead of
// reading a per-quadrant constant each thread derives its quadrant id
// (0..3) and adds it to its own element in place.
__global__ void kernel_derisible_form_0_general(unsigned int* dst)
{
    unsigned int tidx  = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int tidy  = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int sizex = __umul24(gridDim.x, blockDim.x);
    // BUG FIX: the original read an undeclared `size` here; the linear
    // offset must use the FULL row width (sizex) before it is halved.
    unsigned int gloc  = __umul24(sizex, tidy) + tidx;
    sizex >>= 1;  // half-width: quadrant boundary in x
    unsigned int sizey = __umul24(gridDim.y, blockDim.y);
    sizey >>= 1;  // half-height: quadrant boundary in y
    // Same quadrant-index expression as the const_cache lookup in form<0>.
    dst[gloc] += ((tidy / sizey) << 1) + (tidx / sizex);
}
最后需要注意的是 half warp 的一致性。我所说的一致性是指 half warp 内的所有 thread 应该尽可能访问 constant cache 中的同一个地址,否则访问会被串行化(serialized),代价类似于 bank conflicts。这取决于选择哪种数据分布以及数据的数量(比如要求宽度是 16 的倍数)。如果矩阵足够小(例如矩阵的 4 个区域分布在同一个 half warp 内),那就另当别论了——此时只能尽量减少串行化,但矩阵这么小的话,也没有必要使用 CUDA 了。
// Adds a quadrant id (+1..+4) to each element of an 8x8 matrix: each block
// of the 2x2 grid owns one 4x4 quadrant.
// `size` is the device row stride in elements (pitch/sizeof(int) for a
// pitched allocation). Intended launch: <<<dim3(2,2), dim3(4,4)>>>.
__global__ void testkernel(int *d_A, size_t size)
{
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    // Guard preserves the original behavior for larger grids, where
    // blocks beyond the 2x2 quadrant layout wrote nothing.
    if (blockIdx.x < 2 && blockIdx.y < 2)
        // BUG FIX (performance): index row-major (row*size + col) so that
        // consecutive threadIdx.x values hit consecutive addresses — the
        // original d_A[dx*size+dy] gave every warp strided, uncoalesced
        // writes. The final matrix is unchanged: the quadrant increments
        // (+1..+4) land on the same regions as before because transposing
        // the index is compensated by swapping blockIdx.x/blockIdx.y in
        // the value formula below. The four divergent ifs of the original
        // collapse to one branch-free expression.
        d_A[row * size + col] += blockIdx.y * 2 + blockIdx.x + 1;
}
/************************************************************************/
/* HelloCUDA */
/************************************************************************/
// Demo driver: uploads an 8x8 quadrant-patterned matrix with a pitched
// allocation, runs testkernel over a 2x2 grid of 4x4 blocks, and prints
// the result (each quadrant should come back doubled: 2/4/6/8).
int main(void)
{
    int h_A[8][8] = {{1,1,1,1,2,2,2,2},
                     {1,1,1,1,2,2,2,2},
                     {1,1,1,1,2,2,2,2},
                     {1,1,1,1,2,2,2,2},
                     {3,3,3,3,4,4,4,4},
                     {3,3,3,3,4,4,4,4},
                     {3,3,3,3,4,4,4,4},
                     {3,3,3,3,4,4,4,4}};
    int *d_A, *h_B;
    size_t rsize    = 8;                   // matrix is rsize x rsize
    size_t rowBytes = rsize * sizeof(int); // bytes per logical (host) row
    size_t pitch    = 0;                   // device row stride, set below
    dim3 dimgrid(2, 2);
    dim3 dimblock(4, 4);

    // BUG FIX: the result buffer must hold the whole 8x8 matrix; the
    // original malloc'ed only one row (rowBytes) and then read 64 ints.
    h_B = (int*)malloc(rowBytes * rsize);

    cudaMallocPitch((void **)&d_A, &pitch, rowBytes, rsize);
    // BUG FIX: in cudaMemcpy2D the HOST side uses the host row stride
    // (rowBytes); the original passed the device pitch for both sides and
    // read past the end of h_A.
    cudaMemcpy2D(d_A, pitch, h_A, rowBytes, rowBytes, rsize,
                 cudaMemcpyHostToDevice);

    // BUG FIX: the kernel indexes rows with the `size` argument, so it must
    // receive the device row stride in elements (pitch/sizeof(int)), not
    // the logical width — the original ignored the pitch it asked for.
    testkernel<<<dimgrid, dimblock>>>(d_A, pitch / sizeof(int));
    cudaError_t err = cudaGetLastError(); // catch launch-config errors
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    // Blocking 2D copy back (also synchronizes with the kernel); host side
    // again uses rowBytes so h_B is tightly packed for the print loop.
    cudaMemcpy2D(h_B, rowBytes, d_A, pitch, rowBytes, rsize,
                 cudaMemcpyDeviceToHost);

    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 8; j++)
            printf("%2d ", h_B[i * rsize + j]);
        printf("\n");
    }
    cudaFree(d_A);
    free(h_B);
    return 0;  // BUG FIX: the original main was never closed (paste artifact)
}
// NOTE(review): this is a byte-for-byte duplicate of the testkernel defined
// earlier in this post — apparently a copy/paste artifact. Compiling both
// definitions in one translation unit would be a redefinition error; this
// copy should be deleted.
__global__ void testkernel(int *d_A, size_t size)
{
int dx = blockDim.x * blockIdx.x + threadIdx.x;
int dy = blockDim.y * blockIdx.y + threadIdx.y;
if( blockIdx.x == 0 && blockIdx.y == 0 )
d_A[dx*size+dy] += 1;
if( blockIdx.x == 0 && blockIdx.y == 1 )
d_A[dx*size+dy] += 2;
if( blockIdx.x == 1 && blockIdx.y == 0 )
d_A[dx*size+dy] += 3;
if( blockIdx.x == 1 && blockIdx.y == 1 )
d_A[dx*size+dy] += 4;
}