// Earlier debug revision of the per-layer kernel: it only reads the per-node
// bookkeeping data and fills the watch window, without simulating anything.
// (Superseded by the full version below.)
__global__
void kernel_sim64_per_layer( uint nNode,
                             ulong* vSim,
                             uint* vTruth,
                             uint* vNode,
                             uint* vFaninOff,
                             uint* vFanin,
                             ulong* vWatch )
{
    const unsigned int tid_g = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid_g >= nNode) return;
    uint nodeid  = vNode[tid_g];
    uint truth   = vTruth[nodeid];                 // not used yet in this revision
    uint offset  = vFaninOff[tid_g];
    uint nFanins = vFaninOff[tid_g + 1] - offset;  // not used yet in this revision
    // watch window for debug
    vWatch[tid_g*3 + 0] = nodeid;
    vWatch[tid_g*3 + 1] = tid_g;
    vWatch[tid_g*3 + 2] = offset;
}
// Simulate one node for 64 patterns at once: for every minterm present in the
// node's truth table, AND together the (possibly complemented) fanin patterns
// and OR the result into tmp_u64.
__device__
void kernel_sim64_onenode( ulong& tmp_u64,
                           uint truth,
                           uint nFanins,
                           ulong* vSimFanin )
{
    tmp_u64 = 0;
    ulong minTerm_u64;
    for (int truthEntry = 0; truthEntry < (1 << nFanins); truthEntry++)
    {
        if (!TruthHasBit(truth, truthEntry)) continue;   // minterm not in the function
        minTerm_u64 = 0xffffffffffffffffULL;
        for (int fanin = 0; fanin < nFanins; fanin++)
        {
            if (truthEntry & (1 << fanin))
                minTerm_u64 &= vSimFanin[fanin];         // positive literal
            else
                minTerm_u64 &= ~vSimFanin[fanin];        // negative literal
        }
        tmp_u64 |= minTerm_u64;
    }
}
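For sanity-checking the kernel, the same minterm expansion is easy to replicate on the host. A minimal reference sketch follows; the helper name sim64_onenode_ref is hypothetical, and it assumes TruthHasBit(t, i) is equivalent to testing bit i of the truth table:
// Host-side reference for kernel_sim64_onenode (hypothetical helper;
// assumes TruthHasBit(t, i) == ((t >> i) & 1)).
static unsigned long long sim64_onenode_ref( unsigned truth, unsigned nFanins,
                                             const unsigned long long* simFanin )
{
    unsigned long long out = 0;
    for (unsigned entry = 0; entry < (1u << nFanins); entry++) {
        if (!((truth >> entry) & 1u)) continue;             // minterm absent
        unsigned long long minTerm = ~0ULL;
        for (unsigned f = 0; f < nFanins; f++)
            minTerm &= (entry & (1u << f)) ? simFanin[f] : ~simFanin[f];
        out |= minTerm;                                     // accumulate the minterm
    }
    return out;
}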
__global__
void kernel_sim64_per_layer( uint nNode,
                             ulong* vSim,
                             uint* vTruth,
                             uint* vNode,
                             uint* vFaninOff,
                             uint* vFanin,
                             ulong* vWatch )
{
    const unsigned int tid_g = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid_g >= nNode) return;
    uint nodeid  = vNode[tid_g];
    uint truth   = vTruth[nodeid];
    uint offset  = vFaninOff[tid_g];
    uint nFanins = vFaninOff[tid_g + 1] - offset;
    ulong tmp_u64 = 0;
    // Assume fanin count <= 4. The buffer must be per-thread: a single
    // __shared__ ulong[4] here would be overwritten concurrently by every
    // thread in the block, so keep it in per-thread local storage instead.
    ulong vSimFanin[4] = {0};
    for (int fanin = 0; fanin < nFanins; fanin++) {
        vSimFanin[fanin] = vSim[FaninID(vFanin, offset, fanin)];
    }
    kernel_sim64_onenode(tmp_u64, truth, nFanins, vSimFanin);
    vSim[nodeid] = tmp_u64;
    // watch window for debug
    vWatch[tid_g*3 + 0] = nodeid;
    vWatch[tid_g*3 + 1] = vSimFanin[0];
    vWatch[tid_g*3 + 2] = vSimFanin[1];
}
void cuda_simulate_once()
{
    int i, j, nSim = 0;
    nThread = 256;
    nBlock  = (nNode + nThread - 1) / nThread;
    dim3 dimGrid (nBlock, 1, 1);
    dim3 dimBlock(nThread, 1, 1);
    // load the first batch of random CI patterns
    Abc_NtkForEachCi(pNtk, pNode, j)
    {
        h_vSim[pNode->Id] = h_vRandCi[j];
    }
#ifdef _CZH_DEBUG_
    printf("\n ini per-layer h_vSim: \n");
    print_ulong_array(h_vSim, nObj);
#endif
    // allocate the watch window once, outside the loop
    // (allocating it per pass would leak host and device memory)
    sWatch = WATCH_NUM * nNode * sizeof(ulong);
    cutilSafeCall( cudaMallocHost((void**)&h_vWatch, sWatch) );
    cutilSafeCall( cudaMalloc((void**)&d_vWatch, sWatch) );
    // Pipelined simulation: a fresh CI batch enters each pass, and the CO
    // values of batch nSim become valid level-1 passes later, hence
    // level + nSimIters - 1 passes in total to drain the pipeline.
    for(i = 0; i < (level + nSimIters - 1); i++)
    {
        cudaMemcpy(d_vSim, h_vSim, sSim, cudaMemcpyHostToDevice);
        kernel_sim64_per_layer <<< dimGrid, dimBlock >>>( nNode,
                                                          d_vSim,
                                                          d_vTruth,
                                                          d_vNode,
                                                          d_vFaninOff,
                                                          d_vFanin,
                                                          d_vWatch );
        CUDA_SAFE_CALL(cudaMemcpy(h_vSim, d_vSim, sSim, cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(h_vWatch, d_vWatch, sWatch, cudaMemcpyDeviceToHost));
#ifdef _CZH_DEBUG_
        printf("\n Watch Window: \n");
        for(j = 0; j < nNode; j++){
            for(int k = 0; k < WATCH_NUM; k++){
                printf("w%d: %6lu\t", k, h_vWatch[j*WATCH_NUM + k]);
            }
            printf("\n");
        }
        printf("\n i=%d, nSim: %d, per-layer h_vSim: \n", i, nSim);
        print_ulong_array(h_vSim, nObj);
#endif
        if(i >= level-1) {
            nSim = i - (level-1);
            // record the simulation results of all COs for the golden case
            Abc_NtkForEachCo(pNtk, pNode, j)
            {
                h_vGoldenCo[nSim*nCo + j] = h_vSim[Abc_ObjFanin0(pNode)->Id];
            }
        }
        if(i+1 < nSimIters) {
            // load the next batch of random CI patterns
            Abc_NtkForEachCi(pNtk, pNode, j)
            {
                h_vSim[pNode->Id] = h_vRandCi[(i+1) * nCi + j];
            }
#ifdef _CZH_DEBUG_
            printf("\n i: %d, h_vSim: \n", i);
            print_ulong_array(h_vSim, nObj);
#endif
        }
    }
    cudaFree(d_vWatch);
    cudaFreeHost(h_vWatch);
}
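The kernels above assume a CSR-style fanin layout: vFaninOff[t]..vFaninOff[t+1] brackets the fanin IDs of the node handled by thread t, and FaninID(vFanin, offset, i) is presumed to reduce to vFanin[offset + i]. A hedged sketch of how the host side could pack these arrays with the ABC API (the original packing code is not shown here, and h_vNode is assumed to hold the node IDs in thread order):
// Hypothetical host-side packing of the CSR fanin arrays (a sketch,
// not the original code).
uint off = 0;
for (uint t = 0; t < nNode; t++) {
    Abc_Obj_t* pObj = Abc_NtkObj(pNtk, h_vNode[t]);
    h_vFaninOff[t] = off;
    for (int k = 0; k < Abc_ObjFaninNum(pObj); k++)
        h_vFanin[off++] = Abc_ObjFanin(pObj, k)->Id;   // fanin node IDs
}
h_vFaninOff[nNode] = off;  // sentinel, so nFanins = off[t+1] - off[t] works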
// Compile-time barrier: a real __syncthreads() is needed only when the CTA
// spans more than one warp.
#if CTA_DIM>32
#define __barrier { __syncthreads(); }
#else
#define __barrier
#endif
inline __device__
void warp_reduce( TYPE* smem, uint lane )
{
    // warp-synchronous tree reduction over the 32 elements owned by this
    // warp; the result ends up in the warp's first slot (body elided)
    ...
}
inline __device__
void block_reduce( TYPE* smem )
{
    const uint lane   = threadIdx.x & 31u;
    const uint warpid = threadIdx.x >> 5;
    warp_reduce( smem, lane ); __barrier
    // compact the per-warp partial results into the first blockDim.x/32 slots
    if( lane==0 ){
        smem[ warpid ] = smem[ threadIdx.x ];
    } __barrier
    // If CTA_DIM > 32, that trailing barrier is mandatory. For the fragment
    // above, however, the synchronization can be removed with the following
    // alternative compaction, at the cost of a few bank conflicts (each warp's
    // result sits at slot warpid*32, so the reads are 32 slots apart and hit
    // the same bank):
    if( threadIdx.x < blockDim.x>>5 ){
        smem[ threadIdx.x ] = smem[ threadIdx.x<<5 ];
    }
    // Final reduce-op. Neither this loop nor the fragment above needs any
    // per-step synchronization; this is safe because the warp size is 32
    // while the maximum CTA size is 512 (a block size of 1024 would also be
    // correct, if supported), so all remaining active threads belong to one
    // warp. If the maximum CTA size ever grows past that, this code needs
    // adjusting -- or simply cap the CTA size at a reasonable value.
    for( uint n = blockDim.x>>6; n>0; n>>=1 ){
        if( threadIdx.x < n ){
            reduceOp( smem );  // presumably combines smem[threadIdx.x] with smem[threadIdx.x + n]
        }
    }
    // If later code run by multiple threads of the CTA consumes the reduced
    // result, one more synchronization is still required: __barrier
}
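Note that on architectures from Volta onward, the implicit warp-synchronous execution the loop above relies on is no longer guaranteed: the shared-memory pointer must be declared volatile, or explicit warp synchronization must be used. A shuffle-based sketch of the same warp-level reduction, assuming TYPE is an arithmetic type and the reduce-op is a plain sum:
// Shuffle-based warp reduction for sm_30+ (a sketch; assumes the
// reduce-op is addition over TYPE).
inline __device__ TYPE warp_reduce_shfl( TYPE v )
{
    for (int d = 16; d > 0; d >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, d);  // after the loop, lane 0 holds the warp sum
    return v;
}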