实在没办法了，麻烦看看这个程序，尤其请l7331014来看看

hnuzhoulin 2010-04-06 12:10:02

//myfirst.cu文件

#include "myfirst_kernel.cu"

#include "cutil.h"

#include <cstdio>

#include <cstdlib>

#include <stdlib.h>

#include <stdio.h>

#include <time.h>

#include <string.h>

#include <math.h>



 int main( int argc,char** argv)  //the main program added by zhoulin 2010.3.4

    {

	CUT_DEVICE_INIT(argc, argv);

	

	unsigned int memSize = sizeof( float) * numBodies;

	//host端分配内存

	clock_t* timer=(clock_t*)malloc(32*sizeof(clock_t)); 

	if(timer==NULL) {printf("memory of timer is fault");exit(0);}

	float4* h_pos=(float4*)malloc(memSize);

	if(h_pos==NULL) {printf("memory of h_pos is fault");exit(0);}

	float4* h_vel=(float4*)malloc(memSize);

	if(h_vel==NULL) {printf("memory of h_vel is fault");exit(0);}

	

	//生成初试数据  

    float alat=1.5496f;

    float disp=0.5f;

	int index=1;

    float rcell[3][4]={0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.5,0.0,0.5};

    srand(int(time(NULL)/2));

    for(int k=0;k<8;k++)  

        { 

        for(int j=0;j<8;j++)

           {      

           for(int i=0;i<4;i++)

               {    

              for(int L=0;L<4;L++)

                  { 

			h_pos[index].x=alat*(i+rcell[1][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);

			h_pos[index].y=alat*(i+rcell[2][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);

			h_pos[index].z=alat*(i+rcell[3][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);

			h_pos[index].w=1.0f;

			h_vel[index].x=0.0f;

			h_vel[index].y=0.0f;

			h_vel[index].z=0.0f;

			h_vel[index].w=0.0f;

		printf("粒子%d的位置：%f#%f#%f#%f#\n",index,h_pos[index].x,h_pos[index].y,h_pos[index].z);

			//printf("速度：%f-%f-%f-%f\n",h_vel[index].x,h_vel[index].y,h_vel[index].z);

					 index++;

                   }

                }

            }

         }



	//device端分配内存

         clock_t* dtimer;

	CUDA_SAFE_CALL(cudaMalloc((void**)&dtimer, sizeof(clock_t)*16*2));

	float4* d_vel;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_vel, memSize));

	float4* d_pos;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_pos, memSize));



	

	//向显存拷入数据

	printf("位置信息拷入设备端\n");

	CUDA_SAFE_CALL(cudaMemcpy(d_pos, h_pos, memSize,cudaMemcpyHostToDevice));

	printf("速度信息拷入设备端\n");

	CUDA_SAFE_CALL(cudaMemcpy(d_vel, h_vel, memSize,cudaMemcpyHostToDevice));



	//运行核函数

    int sharedMemSize = q*p * sizeof(float4); 

    dim3 threads(p,q,1);

	dim3 grid(16, 1, 1);

	integrateBodies<<< grid, threads, sharedMemSize >>>(d_pos, d_vel,dtimer);



    	// check if kernel invocation generated an error

	 CUT_CHECK_ERROR("Kernel execution failed");



        //将数据拷回主机内存

    CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaMemcpy(h_pos, d_pos, memSize, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(h_vel, d_vel, memSize, cudaMemcpyDeviceToHost));

	//释放存储器

	free(h_pos);

	free(h_vel);

	free(timer);

	CUDA_SAFE_CALL(cudaFree(d_pos));

	CUDA_SAFE_CALL(cudaFree(d_vel));

	CUDA_SAFE_CALL(cudaFree(dtimer));



	//时间测试

   clock_t minStart = timer[0];

   clock_t maxEnd = timer[16];



   for (int i = 1; i < 16; i++)

       { 

       minStart = timer[i] < minStart ? timer[i] : minStart; 

       maxEnd = timer[16+i] > maxEnd ? timer[16+i] : maxEnd;

        }



   printf("time = %d\n", maxEnd - minStart);

	CUT_EXIT(argc, argv);  //exit CUDA

     }

第二个是kernel文件



#ifndef _MYFIRST_KERNEL_H_

#define _MYFIRST_KERNEL_H_

#include <math.h>



#define p 64

#define q 1

#define numBodies  1024

#define deltaTime  0.01  

#define damping  0.5    

#define eps  0.001 



#define LOOP_UNROLL 4



// Macros to simplify shared memory addressing

#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]



__device__ float3 bodyBodyInteraction(float3 ai, float4 bi, float4 bj) 

{

    float3 r;

    // r_ij  [3 FLOPS]

    r.x = bi.x - bj.x;

    r.y = bi.y - bj.y;

    r.z = bi.z - bj.z;



    // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]

    float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;

    distSqr += (float)eps;



    // invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]

    float invDist = 1.0f / sqrtf(distSqr);

    float invDistCube =  invDist * invDist * invDist;



    //float distSixth = distSqr * distSqr * distSqr;

    //float invDistCube = 1.0f / sqrtf(distSixth);

    

    // s = m_j * invDistCube [1 FLOP]

    float s = bj.w * invDistCube;



    // a_i =  a_i + s * r_ij [6 FLOPS]

    ai.x += r.x * s;

    ai.y += r.y * s;

    ai.z += r.z * s;



    return ai;

}



__device__ float3 gravitation(float4 myPos, float3 accel)

{

    extern __shared__ float4 sharedPos[];

    long i=0;

    for (unsigned int counter = 0; counter < blockDim.x; ) 

    {

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

	counter++;

#if LOOP_UNROLL > 1

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

	counter++;

#endif

#if LOOP_UNROLL > 2

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

	counter += 2;

#endif

#if LOOP_UNROLL > 4

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

        accel = bodyBodyInteraction(accel, SX(i++), myPos); 

	counter += 4;

#endif

    }



    return accel;

}





__device__ float3 computeBodyAccel(float4 bodyPos, float4* positions)

{

    extern __shared__ float4 sharedPos[];



    float3 acc = {0.0f, 0.0f, 0.0f};

    

    int numTiles = numBodies / (p * q);

    int gtid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++) 

    {

        sharedPos[gtid] = positions[gtid];       

        __syncthreads();

        acc = gravitation(bodyPos, acc);

        __syncthreads();

    }

    return acc;

}



__global__ void

integrateBodies(float4* oldPos, float4* oldVel,clock_t* timer)

{

    extern __shared__ float4 sharedPos[];

    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; 

    if (threadIdx.x==0)  timer[blockIdx.x]=clock();

    float4 pos = oldPos[index];   

    float3 accel = computeBodyAccel(pos, oldPos);

    float4 vel = oldVel[index];

       

    vel.x +=(float)(accel.x * deltaTime);

    vel.y +=(float)(accel.y * deltaTime);

    vel.z +=(float)(accel.z * deltaTime);  



    vel.x *=(float) damping;

    vel.y *=(float) damping;

    vel.z *=(float) damping;

        

    // new position = old position + velocity * deltaTime

    pos.x +=(float)(vel.x * deltaTime);

    pos.y +=(float)(vel.y * deltaTime);

    pos.z +=(float)(vel.z * deltaTime);



    // store new position and velocity

    oldPos[index] = pos;

    oldVel[index] = vel;

	if (threadIdx.x==0)  timer[blockIdx.x+blockDim.x]=clock();

}



#endif // #ifndef _NBODY_KERNEL_H_

在xp上面模拟运行调试的时候老是出错啊，现在是什么内存不能为write了

...全文

174 3 打赏收藏转发到动态举报

写回复

用AI写文章

3 条回复

切换为时间正序

请发表友善的回复…

发表回复

无心人_过过小日子 2010-04-06

打赏
举报

唉.先把
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
改为
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,sizeof(clock_t)*16*2, cudaMemcpyDeviceToHost));
再说.

有问题的话,晚上再看.

无心人_过过小日子 2010-04-06

打赏
举报

就当阅读考试了.呵呵.

问题是否是在这里啊?
__device__ float3 computeBodyAccel(float4 bodyPos, float4* positions)
{
......
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
......
sharedPos[gtid] = positions[gtid];
__syncthreads();
......
}
这里gtid是整个grid的x方向宽度大小:
dim3 threads(p,q,1);
dim3 grid(16, 1, 1);
因此sharedPos要16*p=16*64=1024个float4大小.
而在启动kernel时分配的动态smem大小是:
int sharedMemSize = q*p * sizeof(float4);
只有q*p=64*1=64个float4!

写sharedPos越界了.呵呵。

hnuzhoulin 2010-04-06

打赏
举报

[Quote=引用 1 楼 l7331014 的回复:]
唉.先把
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
改为
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,sizeof(clock_t)*16*2, cudaMemcpyDeviceToHost));
再说.

有问题的话,晚上再看.
[/Quote]
这个不好意思啊，这个已经改过了，只是贴上来的时候搞错了，对不住了。。

第1章程序设计原理率章首先慨述良好程序设计的重要原理，特别是它们在大型项目中的应用；然后介绍用于发现有救算法的方法，如面向对象的设计和白顶向下的设计。在此过程中，我们提出将在后继章节中论述的...

实在没有办法每时每刻看邮箱，就算看了还要逐个回复很麻烦，而且这篇博客其实是求助类的，想要继续完善的，但是拿到代码的网友基本都没有回复我给我一些建议和改进的办法。我做了一个百度云的链接，大家不用留言了，...

以前写程序，参考的芯片资料都是中文的，感觉英语的看着头大，也看不懂，所以就没想着要去参考参考原厂的datasheet，看了两天的中文资料，这个看了写的程序不正确，马上再下一个，结果桌面上下的DS18B20的资料放了一...

有的bug很简单，看看错误信息就知道，有的bug很复杂，我们需要知道出错时，哪些变量的值是正确的，哪些变量的值是错误的，因此，需要一整套调试程序的手段来修复bug。下面我们来看下常用的Python调试方法1. 断点打印...

平时使用vs开发时，IDE都帮我们...今天这个电脑只装了VC++，没有装MinGW，看看如何使用VC++的编译器来编译程序。 1. cl.exe使用说明 1.1 设置环境变量 cl.exe使用环境设置：在控制台使用cl.exe时需设置PATH、INCL...