cuda程序运行速度比较慢,请问代码哪里有问题,可以进一步优化?

hua田木又寸下 2016-12-14 11:40:03

下面是平台测试的输出
device name: GK20A
computer ability: 3.2
MaxGridSize: 2147483647,65535,65535
MaxThreadPerBlock: 1024
MaxThreadDim: 1024,1024,64
multiProcessorCount: 1
resPerBlock: 32768(K)
sharedMemoryPerBlock(K): 48
totalGlobalMemory: 1892(M)
warpSize: 32
ConstanMemory: 64(K)
程序计算速度测试
系统平台 ubuntu
硬件平台:嵌入式 nvidia Tegra K1 http://www.nvidia.cn/object/tegra-k1-processor-cn.html
GPU
getCensus time=441.06 ms
TransposeDSI time=1205.99 ms
CPU
time of census: 346 ms
time of TransposeDSI: 44 ms

本人GPU编程新手,计算结果速度非常慢。程序实现是图像处理的计算,一共两个函数,里面的两个函数GPU实现耗时远大于CPU实现,请问问题出在哪里?是实现有问题还是配置问题?还是平台有什么特殊设置吗?麻烦将我的问题都指出来,代码贴在下面了,如果需要看源代码的麻烦你留下邮箱~先谢谢各位了!
代码如下:

main.cpp

// Entry points implemented in kernel.cu; declared with C linkage so the
// nvcc-compiled object file links against this translation unit.
extern "C"
int cudaCensus(cv::Mat& left_img, cv::Mat& right_img, unsigned short *dsi);
extern "C"
void cudaInit();
// Uncomment to run the CUDA path in StereoTest() instead of the CPU reference.
//#define USE_CUDA

void StereoTest(void)
{
const int disp_num = 64;
cudaInit();
Mat imgLeft = imread("1-L.png",0);
Mat imgRight= imread("1-R.png",0);
cv::Size img_size;
img_size.width = 1024;
img_size.height = 400;
cv::resize(imgLeft, imgLeft, img_size);
cv::resize(imgRight, imgRight, img_size);
int height = imgLeft.rows;
int width = imgLeft.cols;
u16 *dsi =(u16 *)malloc(sizeof(int)*width*height*disp_num);
u16 *dsiAgg=(u16 *)malloc(sizeof(int)*width*height*disp_num);
cout << "runing..." <<endl;

#ifdef USE_CUDA
memset(dsiAgg, 0, height * width * disp_num* sizeof(unsigned short));
cudaCensus(imgLeft, imgRight, dsiAgg);
#else
clock_t start = clock();
Census((uchar *)imgLeft.data, (uchar *)imgRight.data, height, width, disp_num, dsi);
printf("time of census: %.3f s\n", (double)(clock()-start)/CLOCKS_PER_SEC);

clock_t TransposeDSI_clock = clock();
TransposeDSI(width, height, disp_num, dsiAgg, dsi);
printf("time of TransposeDSI: %.3f s\n", (double)(clock()-TransposeDSI_clock)/CLOCKS_PER_SEC);

printf("total time: %.3f s\n\n\n", (double)(clock()-start)/CLOCKS_PER_SEC);
#endif

free(dsi);
free(dsiAgg);
}

// Program entry point: command-line arguments are accepted but unused.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    StereoTest();
    return 0;
}


stereo.cpp

#include "stereo.h"
// Computes the 7x9 census transform for pixel (i, j): one bit per
// neighbour, set when the neighbour is darker than the centre pixel.
// Bits are packed MSB-first in row-major scan order (63 bits total).
// The caller must guarantee the whole 7x9 window lies inside the image.
inline uint64 getCensus(uchar* source, int width, int i, int j)
{
    const uchar center = source[i * width + j];
    uint64 signature = 0;
    for (int row = i - 3; row <= i + 3; row++)
    {
        // Pointer to the leftmost pixel of the 9-wide window on this row.
        uchar* p = source + row * width + (j - 4);
        for (int col = 0; col < 9; col++)
        {
            signature = (signature << 1) | (uint64)(p[col] < center);
        }
    }
    return signature;
}
// Hamming distance between two 16-bit values: the population count of
// x XOR y, using Kernighan's clear-lowest-set-bit loop.
uchar hamDist16(ushort x, ushort y)
{
    ushort diff = (ushort)(x ^ y);
    uchar bits = 0;
    for (; diff != 0; diff &= (ushort)(diff - 1))
    {
        ++bits;   // one iteration per set bit
    }
    return bits;
}
// Dense census matching cost (CPU reference).
// For every interior pixel (3-row top/bottom border, 4-column left/right
// border) computes the 63-bit census signatures of both rows, then fills
// dsi[(i*width + j)*dispnum + d] with the Hamming distance between
// left(j) and right(j-d). Disparities whose right-image column would fall
// outside the valid census range get the sentinel cost 0xff.
void Census(uchar *left, uchar *right, int height, int width, int dispnum, u16 *dsi)
{
    static bool firstRun = true;
    // 16-bit popcount lookup table, built once and shared by all calls.
    static uchar *popcount16LUT = new uchar[65536];
    uint64 *leftCensus = new uint64[width];
    uint64 *rightCensus = new uint64[width];
    if(firstRun)
    {
        for (int i = 0; i < 65536; i++) {
            popcount16LUT[i] = hamDist16(i,0);
        }
        firstRun = false;
    }
    // BUG FIX: clear the output on EVERY call, not only the first one.
    // The border cells (i < 3, i >= height-3, j < 4, j >= width-4) are
    // never written below, so a reused buffer would keep stale costs.
    memset(dsi, 0, (size_t)height * width * dispnum * sizeof(u16));
    for (int i = 3; i < height - 3; i++)
    {
        // Census signatures for the current row of both images.
        for (int j = 4; j < width - 4; j++)
        {
            leftCensus[j] = getCensus(left, width, i, j);
            rightCensus[j] = getCensus(right, width, i, j);
        }
        // Hamming distance for every disparity hypothesis.
        for (int j = 4; j < width - 4; j++)
        {
            for (int d = 0; d < dispnum; d++)
            {
                if( d <= j - 4)   // right column j-d must have a valid signature
                {
                    // Only the low 48 of the 63 signature bits take part in
                    // the distance (three 16-bit LUT lookups).
                    // NOTE(review): the top 15 bits are silently ignored;
                    // this mirrors the GPU kernel — confirm it is intended.
                    dsi[(i*width+j)*dispnum + d] = popcount16LUT[ (ushort)(leftCensus[j] ^ rightCensus[j-d]) ]
                        + popcount16LUT[ (ushort)((leftCensus[j]>>16) ^ (rightCensus[j-d]>>16)) ]
                        + popcount16LUT[ (ushort)((leftCensus[j]>>32) ^ (rightCensus[j-d]>>32)) ];
                }
                else
                {
                    dsi[(i*width+j)*dispnum + d] = 0xff;   // out-of-range sentinel
                }
            }
        }
    }
    delete [] leftCensus;
    delete [] rightCensus;
}
void TransposeDSI(int width, int height, int n_disp, u16 *dsiSrc, u16 *dsiDes)
{
int index1, index2;
for(int j=0;j<height;j++)
{
for(int i=0;i<width;i++)
{
index1=j*width*n_disp+i*n_disp;
index2=i*height*n_disp+j*n_disp;
memcpy(dsiDes+index2, dsiSrc+index1, sizeof(u16)*n_disp);
}
}
}



kernel.cu 文件

// One-time-init flag for the popcount LUT built/uploaded in cudaInit().
static bool firstRun = true;
// Device pointer to the 64 KB 16-bit popcount table, allocated with
// cudaMalloc in cudaInit(). BUG FIX: the original initialised this with
// host `new uchar[65536]`, leaking 64 KB the moment cudaMalloc
// overwrote the pointer.
static uchar *popcount16LUT = NULL;
// Number of disparity hypotheses evaluated per pixel.
static const int dispnum = 64;

// Hamming distance between two 16-bit values: popcount of x XOR y
// (host-side helper used to build the lookup table).
static unsigned char hamDist_16(ushort x, ushort y)
{
    unsigned short diff = (unsigned short)(x ^ y);
    unsigned char count = 0;
    for (; diff != 0; diff &= (unsigned short)(diff - 1))
    {
        ++count;   // clears one set bit per iteration
    }
    return count;
}

// Transposes the DSI from (row, col, d) to (col, row, d) layout and
// zeroes the source as it goes (the source buffer is reused afterwards).
// Launch: one block per image row (gridDim.x == height); any blockDim.x.
// PERF FIX: the original gave each thread one n_disp-long contiguous
// chunk, so adjacent threads accessed addresses n_disp*2 bytes apart —
// fully uncoalesced. Here each block walks its row's width*n_disp
// elements with a block-stride loop, so consecutive threads read
// consecutive global addresses (coalesced reads; the transposed writes
// remain scattered by construction).
__global__ void TransposeDSI(unsigned short *dsiSrc, unsigned short *dsiDes, int width, int height, int n_disp)
{
    const int j = blockIdx.x;             // source row
    const int rowElems = width * n_disp;  // elements per source row
    unsigned short *srcRow = dsiSrc + (size_t)j * rowElems;
    for (int t = threadIdx.x; t < rowElems; t += blockDim.x)
    {
        const int i = t / n_disp;         // source column
        const int d = t % n_disp;         // disparity index
        dsiDes[((size_t)i * height + j) * n_disp + d] = srcRow[t];
        srcRow[t] = 0;                    // clear source for reuse, as before
    }
}

// One block per image row (gridDim.x == height), one thread per column
// (blockDim.x == width, width <= 1024). Each thread builds the 63-bit
// 7x9 census signature of its pixel in both images, publishes the row's
// signatures through shared memory, then writes the 64 disparity costs
// dsi[(row*width + col)*64 + d] = Hamming(left(col), right(col-d)).
// Out-of-range disparities get the sentinel cost 0xff.
__global__ void getCensus(unsigned char *left_data, unsigned char *right_data,
unsigned char *popcount16LUT, unsigned short *dsi,
int height, int width)
{
    __shared__ unsigned long long int leftCensus[1024];
    __shared__ unsigned long long int rightCensus[1024];
    const int tid = threadIdx.x;   // column
    const int bid = blockIdx.x;    // row
    // BUG FIX: the original only tested the LOW edges (tid >= 4, bid >= 3),
    // so threads near the right/bottom borders read past the end of the
    // images ((bid+3)*width + tid+4). Bound both sides, matching the CPU
    // reference loops.
    const bool interior = (tid >= 4) && (tid < width - 4) &&
                          (bid >= 3) && (bid < height - 3);
    unsigned long long int left_val = 0;
    unsigned long long int right_val = 0;
    if (interior) {
        const unsigned char left_center = left_data[bid * width + tid];
        const unsigned char right_center = right_data[bid * width + tid];
        for (int m = -3; m <= 3; m++)
        {
            for (int n = -4; n <= 4; n++)
            {
                left_val = (left_val << 1)
                         + (left_data[(bid + m) * width + tid + n] < left_center);
                right_val = (right_val << 1)
                          + (right_data[(bid + m) * width + tid + n] < right_center);
            }
        }
    }
    // Border threads publish 0 so every shared slot is defined.
    leftCensus[tid] = left_val;
    rightCensus[tid] = right_val;
    // BUG FIX: __syncthreads() must be reached by ALL threads of the block.
    // The original placed it inside the divergent interior branch, so
    // border threads skipped the barrier — undefined behaviour (potential
    // hang) whenever a block contained both kinds of threads.
    __syncthreads();
    if (interior) {
        const unsigned long long int val_l = leftCensus[tid];
        for (int d = 0; d < 64; d++)
        {
            if (d <= tid - 4)   // right column tid-d has a valid signature
            {
                const unsigned long long int val_r = rightCensus[tid - d];
                // Only the low 48 of 63 signature bits are compared (three
                // 16-bit LUT lookups), matching the CPU reference.
                dsi[(bid * width + tid) * 64 + d] =
                      popcount16LUT[(unsigned short)(val_l ^ val_r)]
                    + popcount16LUT[(unsigned short)((val_l >> 16) ^ (val_r >> 16))]
                    + popcount16LUT[(unsigned short)((val_l >> 32) ^ (val_r >> 32))];
            }
            else
            {
                dsi[(bid * width + tid) * 64 + d] = 0xff;   // out-of-range sentinel
            }
        }
    }
}

// Device buffers allocated once in cudaInit() and reused by cudaCensus().
uchar *left_data, *right_data;   // grayscale input images, WIDTH*HEIGHT bytes each
float *disp_img;                 // disparity image buffer (not used in this excerpt)
unsigned short *out_dist;        // DSI written by the getCensus kernel
unsigned short *dist_agg;        // transposed DSI written by the TransposeDSI kernel

// Allocates the persistent device buffers and uploads the 16-bit popcount
// lookup table. BUG FIX: the whole body is now guarded by firstRun — the
// original re-ran every cudaMalloc on each call, leaking all device
// buffers — and the host-side LUT scratch buffer is freed after upload
// instead of being kept alive forever as a `static new`.
extern "C"
void cudaInit()
{
    if (!firstRun)
        return;   // buffers already allocated by an earlier call
    HANDLE_ERROR(cudaMalloc( (void**)&left_data, WIDTH * HEIGHT * sizeof(uchar)));
    HANDLE_ERROR(cudaMalloc( (void**)&right_data, HEIGHT * WIDTH * sizeof(uchar)));
    HANDLE_ERROR(cudaMalloc( (void**)&out_dist, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
    HANDLE_ERROR(cudaMalloc( (void**)&dist_agg, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
    HANDLE_ERROR(cudaMalloc( (void**)&disp_img, WIDTH * HEIGHT *sizeof(float)));
    // Build the popcount table on the host, upload it, then release the
    // scratch buffer.
    HANDLE_ERROR(cudaMalloc( (void**)&popcount16LUT, 65536));
    uchar *hostLUT = new uchar[65536];
    for (int i = 0; i < 65536; i++) {
        hostLUT[i] = hamDist_16(i, 0);
    }
    HANDLE_ERROR(cudaMemcpy(popcount16LUT, hostLUT, 65536, cudaMemcpyHostToDevice));
    delete [] hostLUT;
    firstRun = false;
}

// Uploads the image pair, runs the census kernel and the DSI transpose,
// prints per-kernel timings, and copies the full transposed DSI back into
// `dsi` (width * height * dispnum u16 entries — the caller must have
// allocated that much). Requires width <= 1024 (used as blockDim.x).
// Returns 0 on success.
extern "C"
int cudaCensus(cv::Mat& left_img, cv::Mat& right_img, unsigned short *dsi) {
    int height = left_img.rows;
    int width = left_img.cols;   // must be <= 1024: one thread per column below
    int data_size = height * width * sizeof(uchar);
    HANDLE_ERROR(cudaMemcpy(left_data, left_img.data, data_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(right_data, right_img.data, data_size, cudaMemcpyHostToDevice));

    /* census */
    cudaEvent_t getCensus_start, getCensus_stop;
    cudaEventCreate(&getCensus_start);
    cudaEventCreate(&getCensus_stop);
    cudaEventRecord(getCensus_start, NULL);
    getCensus<<<height, width>>>(left_data, right_data, popcount16LUT, out_dist, height, width);
    HANDLE_ERROR(cudaGetLastError());     // catch bad launch configuration
    cudaEventRecord(getCensus_stop, NULL);
    // cudaEventSynchronize waits for the stream up to the stop event, so no
    // separate cudaDeviceSynchronize is needed for timing.
    cudaEventSynchronize(getCensus_stop);
    float getCensus_time = 0.0f;
    cudaEventElapsedTime(&getCensus_time, getCensus_start, getCensus_stop);
    cudaEventDestroy(getCensus_start);
    cudaEventDestroy(getCensus_stop);
    std::cout << " getCensus time=" << getCensus_time<<std::endl;

    /* TransposeDSI */
    cudaEvent_t TransposeDSI_1_start, TransposeDSI_1_stop;
    cudaEventCreate(&TransposeDSI_1_start);
    cudaEventCreate(&TransposeDSI_1_stop);
    cudaEventRecord(TransposeDSI_1_start, NULL);
    TransposeDSI<<<height, width>>>(out_dist, dist_agg, width, height, dispnum);
    HANDLE_ERROR(cudaGetLastError());     // catch bad launch configuration
    cudaEventRecord(TransposeDSI_1_stop, NULL);
    cudaEventSynchronize(TransposeDSI_1_stop);
    float TransposeDSI_1_time = 0.0f;
    cudaEventElapsedTime(&TransposeDSI_1_time, TransposeDSI_1_start, TransposeDSI_1_stop);
    cudaEventDestroy(TransposeDSI_1_start);
    cudaEventDestroy(TransposeDSI_1_stop);
    std::cout << " TransposeDSI time=" << TransposeDSI_1_time<<std::endl;

    // BUG FIX: the original copied only width*height*sizeof(u16) bytes —
    // 1/dispnum of the DSI — back to the host, leaving 63 of every 64
    // disparity costs unset in the caller's buffer.
    HANDLE_ERROR(cudaMemcpy(dsi, dist_agg,
                            (size_t)width * height * dispnum * sizeof(unsigned short),
                            cudaMemcpyDeviceToHost));
    return 0;
}
...全文
2631 2 打赏 收藏 转发到动态 举报
写回复
用AI写文章
2 条回复
切换为时间正序
请发表友善的回复…
发表回复
xxiaoccen 2016-12-23
  • 打赏
  • 举报
回复 1
我不是专家。。。 不过你的代码应该有比较明显的问题吧 cuda需要注意很多问题 1. 首先访问全局内存也就是显存的时候,线程需要满足合并访问。 2. 要尽可能减少从全局内存中读取和写入数据。 3. 要充分利用共享内存。 4. 你的显卡的warp size是32,就是说每次是32个线程同时发射指令,同一个warp里面要减少分支,如果同一个warp里面有条件语句,那么这个warp是实际上是顺序执行,而不是并行的。 5. 然后使用共享内存要注意bank conflict。 就说getCences这个kernel吧 首先第一个if语句里面的两层for循环每一次都要从全局内存里读取数据,这个非常费时间,你应该同一个block里面的线程首先协同的把全局内存里的数据块读到共享存储里面,然后在对共享存储里的数据做两重for循环。 同理第二个for循环也有一些问题。。。 transposeDSI也是,每个线程做了n_disp次读取,每次都是从全局存储读数据,写数据,必然不会快。。。 你处理的是图像数据吧,感觉你用二维的线程块,二维的线程网格会比较合适。
hua田木又寸下 2016-12-14
  • 打赏
  • 举报
回复
*.pro文件 TEMPLATE = app CONFIG += console c++11 CONFIG -= app_bundle CONFIG -= qt CONFIG += link_pkgconfig PKGCONFIG += opencv INCLUDEPATH += \ /usr/include SOURCES += main.cpp \ stereo.cpp \ HEADERS += \ stereo.h \ book.h CUDA_SOURCES += kernel.cu QMAKE_LIBDIR += /usr/local/cuda/lib INCLUDEPATH += /usr/include INCLUDEPATH += /usr/include/opencv INCLUDEPATH += /usr/include/opencv2 INCLUDEPATH += /usr/local/cuda/include LIBS += `pkg-config opencv --cflags --libs` LIBS += -lcudart -lcuda # This makes the .cu files appear in your project OTHER_FILES += ./kernel.cu # CUDA settings <-- may change depending on your system CUDA_SOURCES += ./kernel.cu CUDA_SDK = "/usr/local/cuda" # Path to cuda SDK install CUDA_DIR = "/usr/local/cuda" # Path to cuda toolkit install # DO NOT EDIT BEYOND THIS UNLESS YOU KNOW WHAT YOU ARE DOING.... SYSTEM_NAME = unix # Depending on your system either 'Win32', 'x64', or 'Win64' SYSTEM_TYPE = 32 # '32' or '64', depending on your system CUDA_ARCH = sm_21 # Type of CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10' NVCC_OPTIONS = --use_fast_math # include paths INCLUDEPATH += $$CUDA_DIR/include # library directories QMAKE_LIBDIR += $$CUDA_DIR/lib/ CUDA_OBJECTS_DIR = ./ # Add the necessary libraries CUDA_LIBS = -lcuda -lcudart # The following makes sure all path names (which often include spaces) are put between quotation marks CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"') #LIBS += $$join(CUDA_LIBS,'.so ', '', '.so') LIBS += $$CUDA_LIBS # Configuration of the Cuda compiler CONFIG(debug, debug|release) { # Debug mode cuda_d.input = CUDA_SOURCES cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME} cuda_d.dependency_type = TYPE_C QMAKE_EXTRA_COMPILERS += cuda_d } else { # Release mode cuda.input = CUDA_SOURCES cuda.output = 
$$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME} cuda.dependency_type = TYPE_C QMAKE_EXTRA_COMPILERS += cuda }

603

社区成员

发帖
与我相关
我的任务
社区描述
异构开发技术
社区管理员
  • OpenCL和异构编程社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧