cuda平台下矩阵向量乘法栈溢出问题

wlpscu1 2009-03-24 11:02:31

我是参考按照一篇文献写出的一个程序，编译成功了，但是在运行的时候，一运行到main()函数，程序就会说你栈溢出。
我的显卡是Getforce 9300MGS， Vs2008 cuda 2.1的，具体实现的就是一个矩阵和一个向量进行运算，函数mv()和mv_kernel()就是文献给的程序，应该没有错误，我只是写了一个main函数和allocMatrixVec（）函数。附件中是它的完整程序。
#include<cuda_runtime.h>
#include<stdio.h>
#include<stdlib.h>

texture<float4,2,cudaReadModeElementType>texRefA;

__global__ void mv_kernel(float*y,cudaArray* A,float* x,int m,int n){

int tx=threadIdx.x;
int ty=threadIdx.y;
int bx=blockIdx.x;
//为了实现更快的数据读取速度，将向量x中的数值读取16*16个元素存放在共享内存中，即xs。

__shared__ float xs[16][16];
//为结果设置一个缓存Ps。
__shared__ float Ps[16][16];
float4 a;
float* Psptr=(float*)Ps+ty*16+tx;
int ay=bx*16+ty;
float *xptr=x+ty*16+tx;
float *xsptr=(float*)xs+tx*4;

*Psptr=0.0f;
int i;
//下面中的判断条件其实就是将n的后8位全部清零，也就是n必须是256的倍数。
//对于n mod 256以外的数据进行额外的处理。
//下面的程序显示了一个线程在一次循环中要处理4个float4数据，一共16个浮点型数据。
for(i=0;i<(n&~255);i+=256,xptr+=256)
{
//将向量x的元素存放到共享内存中，也就是xs。
xs[ty][tx]=*xptr;
__syncthreads();
//?
int ax=tx+(i>>2);
a=tex2D(texRefA,ax ,ay);
*Psptr+=a.x* *xsptr+a.y* *(xsptr+1)+a.z* *(xsptr+2)+a.z* *(xsptr+3);
//由于上面从纹理内存中取了1个float4(4*4=16Byte)类型的数据，那么下面就应该从ax+16处取值
a=tex2D(texRefA,ax+16,ay);
*Psptr+=a.x* *(xsptr+64)+a.y* *(xsptr+65)+a.z* *(xsptr+66)+a.z* *(xsptr+67);
a=tex2D(texRefA,ax+32,ay);
*Psptr+=a.x* *(xsptr+128)+a.y* *(xsptr+129)+a.z* *(xsptr+130)+a.z* *(xsptr+131);
a=tex2D(texRefA,ax+48,ay);
*Psptr+=a.x* *(xsptr+192)+a.y* *(xsptr+193)+a.z* *(xsptr+194)+a.z* *(xsptr+195);
__syncthreads();
}
//下面处理没有处理的元素。
//将未处理的元素放到xs中
if(i+ty*16+tx<n){
xs[ty][tx]=*xptr;
}
__syncthreads();
int j;
for(j=0;j<((n-i)>>6);j++,xsptr+=61){
a=tex2D(texRefA,tx+(i>>2)+(j<<4),ay);
*Psptr+=a.x* *xsptr++ + a.y* *xsptr++ +a.z* *xsptr++ +a.w* *xsptr;

}
__syncthreads();
int remain=(n-i)&63;
if((tx<<2)<remain){
a=tex2D(texRefA,tx+(i>>2)+(j<<4),ay);
*Psptr+=a.x* *xsptr++;
}
if((tx<<2)+1<remain)*Psptr+=a.y**xsptr++;
if((tx<<2)+2<remain)*Psptr+=a.z**xsptr++;
if((tx<<2)+3<remain)*Psptr+=a.w**xsptr;
__syncthreads();

if(tx<8) *Psptr+=*(Psptr+8);
if(tx<4) *Psptr+=*(Psptr+4);
if(tx<2) *Psptr+=*(Psptr+2);
if(tx<1) *Psptr+=*(Psptr+1);

__syncthreads();
if(ty==0&&(bx<<4)+tx<m)y[(bx<<4)+tx]=Ps[tx][0];

}

void mv(float *y,float *A,float *x,int m,int n){
//下面的程序是将矩阵行数变为16的倍数。m除以16，得到块的行数，如果不能够整除，那么就将行数进行扩充。
int blkNum=(m>>4)+((m&15)?1:0);
//height是实际在运算时候的矩阵行数。
int height=blkNum<<4;
//下面的程序是要将列数变为256的整数倍，如果给定矩阵的列不满足，则就进行扩充。
int width=(n&255)?(((n>>8)+1)<<8):n;
dim3 threads(16,16),grid(blkNum,1);
cudaArray *d_A;
float* d_x,*d_y;

//下面利用特定的分配函数，这样可以分配内存可以使得在内存中这些数据是线性排列的。
cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc<float4>();
//由于数据是按4个float类型进行存储的，所以这里的宽度要进行除4操作。
cudaMallocArray(&d_A,&channelDesc,width>>2,height);
cudaMemcpy2DToArray(d_A,0,0,A,n*sizeof(float),n*sizeof(float),m,cudaMemcpyHostToDevice);
cudaBindTextureToArray(texRefA,d_A);
cudaMalloc((void**)&d_x,n*sizeof(float));
cudaMalloc((void**)&d_y,m*sizeof(float));

cudaMemcpy(d_x,x,n*sizeof(float),cudaMemcpyHostToDevice);
mv_kernel<<<grid,threads>>>(d_y,d_A,d_x,m,n);
cudaMemcpy(y,d_y,m*sizeof(float),cudaMemcpyDeviceToHost);

cudaFree(d_y);
cudaFree(d_x);
cudaUnbindTexture(texRefA);
cudaFreeArray(d_A);

}

void allocMatrixVec(float* A,float* x,int m, int n){

int count=m*n;
int i,j;

for(i=0;i<count;i++)
{
A[i]=rand()/1.0;
}
for(j=0;j<n;j++){
x[j]=rand()/1.0;
}
}

void main()
{
int row=700,column=800;
float A[700*800];
float x[800],y[700];
allocMatrixVec(A,x,row,column);
mv(y,A,x,row,column);
}

...全文