580
社区成员
发帖
与我相关
我的任务
分享
#include <cmath>
#include <cutil_inline.h>
typedef float Real;
#define BLOCK_SIZE 8
__device__ void
kernel_vec_subsum(unsigned int nz, Real* g_idata, Real* g_odata)
{
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ Real sdata[BLOCK_SIZE];
sdata[tid] = (i < nz) ? g_idata[i] : 0;
__syncthreads();
for(unsigned int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
{
if (tid < s) sdata[tid] += sdata[tid + s];
__syncthreads();
}
__syncthreads();
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__device__ void
kernel_vec_sum(unsigned int nz, Real* g_vec, Real* g_sum)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int size = nz;
while( size > 1 )
{
kernel_vec_subsum(size, g_vec, g_vec);
size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
__syncthreads();
}
if(i == 0) g_sum[0] = g_vec[0];
}
__global__ void
global_vec_sum(unsigned int nz, Real* g_vec, Real* g_sum)
{
kernel_vec_sum(nz, g_vec, g_sum);
}
Real
vec_sum(unsigned int nz, Real* vec)
{
Real* g_idata;
size_t mem_size = nz * sizeof(Real);
cutilSafeCall( cudaMalloc( (void**) &g_idata, mem_size));
cutilSafeCall( cudaMemcpy( g_idata, vec, mem_size, cudaMemcpyHostToDevice) );
Real* g_odata;
cutilSafeCall( cudaMalloc( (void**) &g_odata, sizeof(Real)));
global_vec_sum <<< (nz + BLOCK_SIZE - 1) / (BLOCK_SIZE), BLOCK_SIZE >>>
(nz, g_idata, g_odata);
Real sum;
cutilSafeCall( cudaMemcpy( &sum, g_odata, sizeof(Real), cudaMemcpyDeviceToHost) );
cutilSafeCall(cudaFree(g_idata));
cutilSafeCall(cudaFree(g_odata));
return sum;
}
void test_vec_sum()
{
unsigned int nz = 80 * BLOCK_SIZE + 3; // 80 * 8 + 3 = 643
Real* vec = new Real[nz];
for(unsigned int i = 0; i < nz; ++i)
vec[i] = 1.0;
Real sum = vec_sum(nz, vec);
printf("%5.2f ", sum);
delete[] vec;
}
int main( int argc, char** argv )
{
cutilDeviceInit(argc, argv);
test_vec_sum();
cutilExit(argc, argv);
}
__device__ void
kernel_vec_sum_v2(unsigned int nz, Real* g_vec, Real* g_sum)
// 这里要求 g_sum 是与 g_vec 等长的向量,用于临时交换数据
{
unsigned int tid = threadIdx.x;
unsigned int bid = blockIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int size = nz;
while( size > 1 )
{
kernel_vec_subsum(size, g_vec, g_sum);
size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
__syncthreads();
// 将本轮计算结果 从g_sum 复制到 g_vec 向量中
if( tid == 0 && bid < size )
g_vec[bid] = g_sum[bid];
__syncthreads();
}
}
__global__ void
global_vec_subsum(unsigned int nz, Real* g_idata, Real* g_odata)
{
kernel_vec_subsum(nz, g_idata, g_odata);
}
Real
vec_sum_v2(unsigned int nz, Real* vec)
{
Real* g_idata;
size_t mem_size = nz * sizeof(Real);
cutilSafeCall( cudaMalloc( (void**) &g_idata, mem_size));
cutilSafeCall( cudaMemcpy( g_idata, vec, mem_size, cudaMemcpyHostToDevice) );
Real* g_odata;
cutilSafeCall( cudaMalloc( (void**) &g_odata, mem_size));
while( nz > 1 ) {
unsigned int nz_new = (nz + BLOCK_SIZE - 1) / BLOCK_SIZE;
global_vec_subsum <<< nz_new, BLOCK_SIZE >>>
(nz, g_idata, g_odata);
nz = nz_new;
cutilSafeCall( cudaMemcpy( g_idata, g_odata, nz * sizeof(Real), cudaMemcpyDeviceToDevice) );
}
Real sum;
cutilSafeCall( cudaMemcpy( &sum, g_odata, sizeof(Real), cudaMemcpyDeviceToHost) );
cutilSafeCall(cudaFree(g_idata));
cutilSafeCall(cudaFree(g_odata));
return sum;
}