实在没办法了,麻烦看看这个程序,尤其请l7331014来看看

hnuzhoulin 2010-04-06 12:10:02
//myfirst.cu文件
#include "myfirst_kernel.cu"
#include "cutil.h"
#include <cstdio>
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <math.h>

int main( int argc,char** argv) //the main program added by zhoulin 2010.3.4
{
CUT_DEVICE_INIT(argc, argv);

unsigned int memSize = sizeof( float) * numBodies;
//host端分配内存
clock_t* timer=(clock_t*)malloc(32*sizeof(clock_t));
if(timer==NULL) {printf("memory of timer is fault");exit(0);}
float4* h_pos=(float4*)malloc(memSize);
if(h_pos==NULL) {printf("memory of h_pos is fault");exit(0);}
float4* h_vel=(float4*)malloc(memSize);
if(h_vel==NULL) {printf("memory of h_vel is fault");exit(0);}

//生成初试数据
float alat=1.5496f;
float disp=0.5f;
int index=1;
float rcell[3][4]={0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.5,0.0,0.5};
srand(int(time(NULL)/2));
for(int k=0;k<8;k++)
{
for(int j=0;j<8;j++)
{
for(int i=0;i<4;i++)
{
for(int L=0;L<4;L++)
{
h_pos[index].x=alat*(i+rcell[1][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);
h_pos[index].y=alat*(i+rcell[2][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);
h_pos[index].z=alat*(i+rcell[3][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);
h_pos[index].w=1.0f;
h_vel[index].x=0.0f;
h_vel[index].y=0.0f;
h_vel[index].z=0.0f;
h_vel[index].w=0.0f;
printf("粒子%d的位置:%f#%f#%f#%f#\n",index,h_pos[index].x,h_pos[index].y,h_pos[index].z);
//printf("速度:%f-%f-%f-%f\n",h_vel[index].x,h_vel[index].y,h_vel[index].z);
index++;
}
}
}
}

//device端分配内存
clock_t* dtimer;
CUDA_SAFE_CALL(cudaMalloc((void**)&dtimer, sizeof(clock_t)*16*2));
float4* d_vel;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_vel, memSize));
float4* d_pos;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_pos, memSize));


//向显存拷入数据
printf("位置信息拷入设备端\n");
CUDA_SAFE_CALL(cudaMemcpy(d_pos, h_pos, memSize,cudaMemcpyHostToDevice));
printf("速度信息拷入设备端\n");
CUDA_SAFE_CALL(cudaMemcpy(d_vel, h_vel, memSize,cudaMemcpyHostToDevice));

//运行核函数
int sharedMemSize = q*p * sizeof(float4);
dim3 threads(p,q,1);
dim3 grid(16, 1, 1);
integrateBodies<<< grid, threads, sharedMemSize >>>(d_pos, d_vel,dtimer);

// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");

//将数据拷回主机内存
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_pos, d_pos, memSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_vel, d_vel, memSize, cudaMemcpyDeviceToHost));
//释放存储器
free(h_pos);
free(h_vel);
free(timer);
CUDA_SAFE_CALL(cudaFree(d_pos));
CUDA_SAFE_CALL(cudaFree(d_vel));
CUDA_SAFE_CALL(cudaFree(dtimer));

//时间测试
clock_t minStart = timer[0];
clock_t maxEnd = timer[16];

for (int i = 1; i < 16; i++)
{
minStart = timer[i] < minStart ? timer[i] : minStart;
maxEnd = timer[16+i] > maxEnd ? timer[16+i] : maxEnd;
}

printf("time = %d\n", maxEnd - minStart);
CUT_EXIT(argc, argv); //exit CUDA
}

第二个是kernel文件

#ifndef _MYFIRST_KERNEL_H_
#define _MYFIRST_KERNEL_H_
#include <math.h>

#define p 64
#define q 1
#define numBodies 1024
#define deltaTime 0.01
#define damping 0.5
#define eps 0.001

#define LOOP_UNROLL 4

// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]

__device__ float3 bodyBodyInteraction(float3 ai, float4 bi, float4 bj)
{
float3 r;
// r_ij [3 FLOPS]
r.x = bi.x - bj.x;
r.y = bi.y - bj.y;
r.z = bi.z - bj.z;

// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
distSqr += (float)eps;

// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float invDist = 1.0f / sqrtf(distSqr);
float invDistCube = invDist * invDist * invDist;

//float distSixth = distSqr * distSqr * distSqr;
//float invDistCube = 1.0f / sqrtf(distSixth);

// s = m_j * invDistCube [1 FLOP]
float s = bj.w * invDistCube;

// a_i = a_i + s * r_ij [6 FLOPS]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;

return ai;
}

__device__ float3 gravitation(float4 myPos, float3 accel)
{
extern __shared__ float4 sharedPos[];
long i=0;
for (unsigned int counter = 0; counter < blockDim.x; )
{
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter++;
#if LOOP_UNROLL > 1
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter++;
#endif
#if LOOP_UNROLL > 2
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter += 2;
#endif
#if LOOP_UNROLL > 4
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter += 4;
#endif
}

return accel;
}


__device__ float3 computeBodyAccel(float4 bodyPos, float4* positions)
{
extern __shared__ float4 sharedPos[];

float3 acc = {0.0f, 0.0f, 0.0f};

int numTiles = numBodies / (p * q);
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++)
{
sharedPos[gtid] = positions[gtid];
__syncthreads();
acc = gravitation(bodyPos, acc);
__syncthreads();
}
return acc;
}

__global__ void
integrateBodies(float4* oldPos, float4* oldVel,clock_t* timer)
{
extern __shared__ float4 sharedPos[];
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if (threadIdx.x==0) timer[blockIdx.x]=clock();
float4 pos = oldPos[index];
float3 accel = computeBodyAccel(pos, oldPos);
float4 vel = oldVel[index];

vel.x +=(float)(accel.x * deltaTime);
vel.y +=(float)(accel.y * deltaTime);
vel.z +=(float)(accel.z * deltaTime);

vel.x *=(float) damping;
vel.y *=(float) damping;
vel.z *=(float) damping;

// new position = old position + velocity * deltaTime
pos.x +=(float)(vel.x * deltaTime);
pos.y +=(float)(vel.y * deltaTime);
pos.z +=(float)(vel.z * deltaTime);

// store new position and velocity
oldPos[index] = pos;
oldVel[index] = vel;
if (threadIdx.x==0) timer[blockIdx.x+blockDim.x]=clock();
}

#endif // #ifndef _NBODY_KERNEL_H_


在xp上面模拟运行调试的时候老是出错啊,现在是什么内存不能为write了

...全文
174 3 打赏 收藏 转发到动态 举报
写回复
用AI写文章
3 条回复
切换为时间正序
请发表友善的回复…
发表回复
  • 打赏
  • 举报
回复
唉.先把
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
改为
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,sizeof(clock_t)*16*2, cudaMemcpyDeviceToHost));
再说.

有问题的话,晚上再看.
  • 打赏
  • 举报
回复
就当阅读考试了.呵呵.

问题是否是在这里啊?
__device__ float3 computeBodyAccel(float4 bodyPos, float4* positions)
{
......
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
......
sharedPos[gtid] = positions[gtid];
__syncthreads();
......
}
这里gtid是整个grid的x方向宽度大小:
dim3 threads(p,q,1);
dim3 grid(16, 1, 1);
因此sharedPos要16*p=16*64=1024个float4大小.
而在启动kernel时分配的动态smem大小是:
int sharedMemSize = q*p * sizeof(float4);
只有q*p=64*1=64个float4!

写sharedPos越界了.呵呵。
hnuzhoulin 2010-04-06
  • 打赏
  • 举报
回复
[Quote=引用 1 楼 l7331014 的回复:]
唉.先把
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
改为
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,sizeof(clock_t)*16*2, cudaMemcpyDeviceToHost));
再说.

有问题的话,晚上再看.
[/Quote]
这个不好意思啊,这个已经改过了,只是贴上来的时候搞错了,对不住了。。

231

社区成员

发帖
与我相关
我的任务
社区描述
CUDA on Windows XP
社区管理员
  • CUDA on Windows XP社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧