cuda的kernel函数中能调用自定义的结构体吗
我在计算中经常要使用复数,以前用C++的时候都是用的复数模板,现在用cuda不知道是否支持,所以我就自建了一个复数结构体并定义了几个简单的复数运算,做了一个程序测试一下。
程序就是计算一个复数数组中的树的平方和,编译的时候编译器提示出现错误终止,不知道原因何在?我的系统是win7 x64旗舰版,在VS2008下用cuda3.2的toolkit编译的,配置安装没有问题,以前做的几个简单的例子都可以用。源代码如下
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define DATA_SIZE 1048576
#define BLOCK_NUM 32
#define THREAD_NUM 256
struct Complex{
float r,i;
Complex(float _x, float _y);
Complex(const Complex &c);
Complex();
~Complex(){}
Complex operator*(const Complex &c)const;
Complex operator=(const Complex &c);
Complex operator+(const Complex &c)const;
Complex operator+=(const Complex &c);
};
Complex::Complex(float _x, float _y)
{
r=_x; i=_y;
}
Complex::Complex(const Complex &c)
{
r=c.r; i=c.i;
}
Complex::Complex()
{
r=0.0; i=0.0;
}
Complex Complex::operator*(const Complex & c)const
{
return Complex(r*c.r-i*c.i,r*c.i+i*c.r);
}
Complex Complex::operator=(const Complex & c)
{
r=c.r; i=c.i;
return *this;
}
Complex Complex::operator+(const Complex &c)const
{
return Complex(r+c.r, r+c.i);
}
Complex Complex::operator+=(const Complex &c)
{
r += c.r; i += c.i;
return *this;
}
void GenerateNumber(Complex* number, int size)
{
for(int i=0; i<size; i++)
number[i] = Complex(i*1.0,(i+1)*1.0);
}
__global__ static void sumOfSquares(Complex *num, Complex* result, clock_t* time)
{
extern __shared__ Complex shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
int i;
int offset;
if(tid == 0)time[bid] = clock();
shared[tid] = Complex(0.0,0.0);
for(i = bid*THREAD_NUM+tid; i<DATA_SIZE;
i += BLOCK_NUM*THREAD_NUM)
shared[tid] += num[i]*num[i];
__syncthreads();
offset = THREAD_NUM / 2;
while(offset>0)
{
if(tid<offset)
shared[tid] += shared[tid+offset];
offset >>= 1;
__syncthreads();
}
if(tid==0)
{
result[bid] = shared[0];
if(tid==0)time[bid+BLOCK_NUM] = clock();
}
}
Complex data[DATA_SIZE];
int main()
{
GenerateNumber(data, DATA_SIZE);
Complex *gpudata, *result;
clock_t *time;
cudaMalloc((void**)&gpudata, sizeof(Complex)*DATA_SIZE);
cudaMalloc((void**)&result, sizeof(Complex)*BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM*2);
cudaMemcpy(gpudata, data, sizeof(Complex)*DATA_SIZE, cudaMemcpyHostToDevice);
sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM*sizeof(Complex)>>>
(gpudata, result, time);
Complex sum[BLOCK_NUM];
clock_t time_used[BLOCK_NUM*2];
cudaMemcpy(&sum, result, sizeof(Complex)*BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_used, time, sizeof(clock_t)*BLOCK_NUM*2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
Complex final_sum;
for(int i=0; i<BLOCK_NUM; i++)
final_sum += sum[i];
clock_t min_start, max_end;
min_start = time_used[0];
max_end = time_used[BLOCK_NUM];
for(int i=1; i<BLOCK_NUM; i++)
{
if(min_start > time_used[i])
min_start = time_used[i];
if(max_end < time_used[i+BLOCK_NUM])
max_end = time_used[i+BLOCK_NUM];
}
printf("sum: (%f , %f) time: %d\n", final_sum.r, final_sum.i , max_end-min_start);
return 0;
}
在给kernel函数传参的时候我用的是自定义的Complex结构体,kernel函数中复数的乘法和加法运算,我是调用的外部的定义的重载操作符,不知道是不是这个的问题。
应该如何修改,请赐教