推荐一个关于CUDA编程好东西(一)

_梦魇花葬 2014-07-27 08:53:37
加精
引用
http://blog.csdn.net/augusdi/article/details/12434077

1、__constant__和__device__,__shared__的使用说明
其对应的程序:
// Example 1: __constant__ / __device__ / __shared__ qualifiers.
// Only __device__ and __constant__ variables may live at file scope and be
// accessed from the host through cudaMemcpyToSymbol / cudaMemcpyFromSymbol.
// A __shared__ variable is per-block on-chip scratch memory and is NOT a
// valid symbol-copy target (the original mistakenly declared one at file
// scope here).
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

//__constant__ int device_global_var = 5;  // alternative: constant memory
__device__ int device_global_var = 5;      // global (device) memory symbol

__global__ void kernel()
{
    // __shared__ belongs inside a kernel: one copy per thread block,
    // uninitialized until written. Present only to illustrate the qualifier.
    __shared__ int xx;
    (void)xx;
}

int main()
{
    int host_var = 5;
    // Host -> device: write host_var into the device symbol.
    cudaMemcpyToSymbol(device_global_var, &host_var, sizeof(int));
    printf("value=%d\n", host_var);
    // Device -> host: read the symbol back into host_var.
    cudaMemcpyFromSymbol(&host_var, device_global_var, sizeof(int));
    printf("device_value=%d\n", host_var);
    system("pause");  // Windows-only pause so the console stays open
    return 0;
}

2、分配二维数组实现两个二维数组相加
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
#define N 16

// File-scope device arrays; the host reaches them via the symbol-copy APIs.
__device__ int device_a[N][N], device_b[N][N], device_c[N][N];

// Element-wise addition c = a + b for N x N matrices.
// Expects a 2D launch with at least one thread per element; threads outside
// the matrix are filtered out by the bounds check.
__global__ void VecAdd(int a[N][N], int b[N][N], int c[N][N])
{
    int global_threadId_x = blockIdx.x * blockDim.x + threadIdx.x;
    int global_threadId_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (global_threadId_x < N && global_threadId_y < N)
    {
        c[global_threadId_y][global_threadId_x] =
            a[global_threadId_y][global_threadId_x] +
            b[global_threadId_y][global_threadId_x];
    }
}
// Print an N x N matrix, one row per line.
// A trailing space separates values (the original "%d" ran them together).
void printfArray(int data[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%d ", data[i][j]);
        }
        printf("\n");
    }
}
// CPU reference implementation of c = a + b, useful for checking the GPU
// result element by element.
void host_Add(int a[N][N], int b[N][N], int c[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
}
int main()
{
    int a[N][N], b[N][N];
    int tempA[N][N], tempC[N][N];
    int k = 0;

    // Fill both inputs with the running counter 0 .. N*N-1.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a[i][j] = k;
            b[i][j] = k;
            k++;
        }
    }

    // Host -> device: load the file-scope __device__ symbols.
    cudaMemcpyToSymbol(device_a, a, sizeof(int) * N * N);
    cudaMemcpyToSymbol(device_b, b, sizeof(int) * N * N);

    // Round-trip check: read device_a back and print it.
    cudaMemcpyFromSymbol(tempA, device_a, sizeof(int) * N * N);
    printf("tempA====\n");
    printfArray(tempA);

    // A __device__ symbol cannot be passed to a kernel directly from host
    // code; resolve its device address first. (The original allocated an
    // unused, leaked device buffer here and never launched the kernel,
    // which is why readers saw garbage in device_c.)
    int (*pa)[N], (*pb)[N], (*pc)[N];
    cudaGetSymbolAddress((void**)&pa, device_a);
    cudaGetSymbolAddress((void**)&pb, device_b);
    cudaGetSymbolAddress((void**)&pc, device_c);

    dim3 block(N, N);  // N*N = 256 threads: one per matrix element
    VecAdd<<<1, block>>>(pa, pb, pc);

    cudaMemcpyFromSymbol(tempC, device_c, sizeof(int) * N * N);
    printf("tempC====\n");
    printfArray(tempC);

    system("pause");
    return 0;
}

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝
代码:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
#include<iostream>
// Fill the pitched 2D array d_array with 123.0 and mirror every element into
// the linear array destinationArray.
//   d_array          : device array allocated with cudaMallocPitch
//   destinationArray : linear device array of rowCount*columnCount floats
//   pitch            : row stride of d_array in BYTES (from cudaMallocPitch)
// Uses a grid-stride loop so each (row, column) is handled by exactly one
// thread; the original made every thread of the launch redundantly rewrite
// the entire matrix.
__global__ void CopyData(float* d_array,
                         float* destinationArray,
                         size_t pitch,
                         int columnCount,
                         int rowCount)
{
    int total = rowCount * columnCount;
    int stride = gridDim.x * blockDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride)
    {
        int row = idx / columnCount;
        int column = idx % columnCount;
        // Pitch is in bytes, so row addressing must go through char*.
        float* rowData = (float*)(((char*)d_array) + (row * pitch));
        rowData[column] = 123.0f;  // float literal; 123.0 was a double
        destinationArray[(row * columnCount) + column] = rowData[column];
    }
}
int main(int argc, char** argv)
{
    int columnCount = 15;
    int rowCount = 10;

    float* d_array;             // pitched device source array
    float* d_destinationArray;  // linear device destination array

    // Host buffer for the linear copy back.
    float* h_array = new float[columnCount * rowCount];

    // Row stride in bytes chosen by cudaMallocPitch so every row starts on
    // a properly aligned boundary.
    size_t pitch;
    cudaMallocPitch(&d_array, &pitch, columnCount * sizeof(float), rowCount);
    cudaMalloc(&d_destinationArray, columnCount * rowCount * sizeof(float));

    // Fill d_array with 123.0 and mirror it into d_destinationArray.
    CopyData<<<100, 512>>>(d_array, d_destinationArray, pitch, columnCount, rowCount);

    // Pitched copy back: cudaMemcpy2D de-pitches the rows into a packed
    // host buffer.
    float* h_result = (float*)malloc(sizeof(float) * columnCount * rowCount);
    memset(h_result, 0, sizeof(float) * columnCount * rowCount);
    cudaMemcpy2D(h_result, columnCount * sizeof(float), d_array, pitch,
                 columnCount * sizeof(float), rowCount, cudaMemcpyDeviceToHost);

    // Linear copy back with plain cudaMemcpy (blocking, so it also acts as
    // the synchronization point after the kernel).
    cudaMemcpy(h_array,
               d_destinationArray,
               columnCount * rowCount * sizeof(float),
               cudaMemcpyDeviceToHost);

    // NOTE: the original used unqualified cout/endl without any
    // using-declaration, which does not compile; qualify with std::.
    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < columnCount; j++)
        {
            std::cout << "h_result[" << (i * columnCount) + j << "]="
                      << h_result[(i * columnCount) + j] << std::endl;
        }
    }
    system("pause");

    printf("h_array==\n");
    // print out the values (all the values are 123.0)
    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < columnCount; j++)
        {
            std::cout << "h_array[" << (i * columnCount) + j << "]="
                      << h_array[(i * columnCount) + j] << std::endl;
        }
    }
    system("pause");

    // Release device and host memory (all of it leaked in the original).
    cudaFree(d_array);
    cudaFree(d_destinationArray);
    free(h_result);
    delete[] h_array;
    return 0;
}

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法
代码:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdlib.h>
#include<stdio.h>
// Device code: walk a 3D pitched allocation slice by slice, row by row, and
// set every float3 element to (2, 3, 4).
// devPitchedPtr comes from cudaMalloc3D; extent.width is in BYTES (as the
// cudaMalloc3D API requires for non-array memory), so it must be converted
// to an element count before iterating a row. The original looped
// i < extent.width and overran each row by a factor of sizeof(float3).
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;          // row stride in bytes
    size_t slicePitch = pitch * extent.height;   // slice stride in bytes
    int elemsPerRow = (int)(extent.width / sizeof(float3));
    for (int k = 0; k < extent.depth; k++) {
        char* slice = devPtr + k * slicePitch;
        for (int j = 0; j < extent.height; j++) {
            float3* row = (float3*)(slice + j * pitch);
            for (int i = 0; i < elemsPerRow; i++)
            {
                row[i].x = 2;
                row[i].y = 3;
                row[i].z = 4;
            }
        }
    }
}
const int x = 6;   // float3 elements per row
const int y = 60;  // rows per slice
const int z = 66;  // slices

int main() {
    cudaError_t status = cudaSuccess;

    // ======== host memory
    float3* mem_host = (float3*)malloc(sizeof(float3) * x * y * z);
    float3* mem_host2 = (float3*)malloc(sizeof(float3) * x * y * z);
    for (int i = 0; i < x * y * z; i++) {
        mem_host[i].x = 10;
        mem_host[i].y = 100;
        mem_host[i].z = 1000;
    }

    // ======== device memory
    // For non-array allocations, cudaExtent.width is specified in BYTES.
    cudaExtent extent;
    extent.width = x * sizeof(float3);
    extent.height = y;
    extent.depth = z;
    cudaPitchedPtr mem_device;
    status = cudaMalloc3D(&mem_device, extent);
    if (status != cudaSuccess) { fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status)); }

    // A host -> device cudaMemcpy3D of mem_host would go here; it is
    // omitted because MyKernel overwrites every element anyway:
    //   cudaMemcpy3DParms p = {0};
    //   p.srcPtr = make_cudaPitchedPtr((void*)mem_host, x * sizeof(float3), x, y);
    //   p.dstPtr = mem_device; p.extent = extent; p.kind = cudaMemcpyHostToDevice;
    //   status = cudaMemcpy3D(&p);

    MyKernel<<<1, 1>>>(mem_device, extent);

    // ======== copy device -> host
    cudaMemcpy3DParms q = {0};
    q.srcPtr = mem_device;
    q.dstPtr = make_cudaPitchedPtr((void*)mem_host2, x * sizeof(float3), x, y);
    q.extent = extent;
    q.kind = cudaMemcpyDeviceToHost;
    status = cudaMemcpy3D(&q);
    if (status != cudaSuccess) { fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status)); }

    for (int i = 0; i < x * y * z; i++)
        printf("%f %f %f\n", mem_host2[i].x, mem_host2[i].y, mem_host2[i].z);

    // Release device and host memory (the host buffers leaked originally).
    cudaFree(mem_device.ptr);
    free(mem_host);
    free(mem_host2);
    system("pause");
    return 0;
}

5、不带共享存储器的矩阵的相乘
代码:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
// Row-major matrix: element at (row, col) is element[row * width + col].
typedef struct
{
    int width;       // number of columns
    int height;      // number of rows
    float* element;  // contiguous row-major storage (host or device)
} Matrix;

// Threads per block dimension; each thread block computes a
// BLOCK_SIZE x BLOCK_SIZE tile of the output matrix.
#define BLOCK_SIZE 16

// Forward declaration; defined after main below.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Print a host-resident matrix, one row per line.
// A trailing space separates values (the original "%f" ran them together).
void printMatrix(const Matrix& A)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < A.width; j++)
        {
            printf("%f ", A.element[i * A.width + j]);
        }
        printf("\n");
    }
}
// Compute C = A * B on the GPU (naive version, no shared-memory tiling).
// All three matrices must be pre-allocated host matrices; dimensions are
// assumed compatible (A.width == B.height, C is A.height x B.width).
// Prints the inputs before and the result after the multiplication.
void MatMul(const Matrix& A, const Matrix& B, Matrix& C)
{
    printf("matrixA");
    printMatrix(A);
    printf("matrixB");
    printMatrix(B);
    system("pause");

    // Device copy of A.
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.element, size);
    cudaMemcpy(d_A.element, A.element, size, cudaMemcpyHostToDevice);

    // Device copy of B.
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.element, size);
    cudaMemcpy(d_B.element, B.element, size, cudaMemcpyHostToDevice);

    // Device buffer for the result.
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.element, size);

    // Ceil-division so dimensions that are not multiples of BLOCK_SIZE are
    // still fully covered (the original truncated and dropped the last
    // partial tile); the kernel's bounds check discards extra threads.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x,
                 (A.height + dimBlock.y - 1) / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Blocking copy back doubles as synchronization after the launch.
    cudaMemcpy(C.element, d_C.element, size, cudaMemcpyDeviceToHost);

    cudaFree(d_A.element);
    cudaFree(d_B.element);
    cudaFree(d_C.element);

    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }
    system("pause");
}
// Fill every element of a host-resident matrix with the given value.
// (Unlike the commented-out memset in main, this works for arbitrary float
// values, since memset writes bytes, not floats.)
void SetMatrixValue(Matrix& A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.element[i * A.width + j] = value;
        }
    }
}
// Build two 128x128 matrices of all 2s, multiply them on the GPU and print
// the result twice (once inside MatMul, once here).
// Standard C++ requires main to return int; the original used void main().
int main()
{
    Matrix A, B, C;
    A.width = 128;
    A.height = 128;
    A.element = (float*)malloc(A.width * A.height * sizeof(float));
    SetMatrixValue(A, 2);

    B.width = 128;
    B.height = 128;
    B.element = (float*)malloc(B.width * B.height * sizeof(float));
    SetMatrixValue(B, 2);

    C.width = 128;
    C.height = 128;
    C.element = (float*)malloc(C.width * C.height * sizeof(float));

    MatMul(A, B, C);

    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }

    // Release the host matrices (leaked in the original).
    free(A.element);
    free(B.element);
    free(C.element);
    system("pause");
    return 0;
}
// Naive matrix multiply: one thread computes one element of C from global
// memory (no shared-memory tiling). Expects a 2D launch covering at least
// C.height x C.width threads; the guard below discards the excess threads
// when the dimensions are not multiples of the block size (the original
// had no bounds check and wrote out of bounds in that case).
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= C.height || col >= C.width)
        return;
    float CValue = 0;
    for (int e = 0; e < A.width; ++e)
    {
        CValue += A.element[row * A.width + e] * B.element[e * B.width + col];
    }
    C.element[row * C.width + col] = CValue;
}

...全文
3027 13 打赏 收藏 转发到动态 举报
写回复
用AI写文章
13 条回复
切换为时间正序
请发表友善的回复…
发表回复
Niclezhang 2014-09-29
  • 打赏
  • 举报
回复
话说怎么调用相加的函数呢?
	int tempC[N][M];
	dim3 block(N, M);
	VecAdd<<<1, block>>>(device_a, device_b, device_c);

	cudaMemcpyFromSymbol(tempC,device_c,sizeof(int)*N*M);
	printfArray(tempC);
结果全是随机数,大神教一下,刚接触cuda,想找个例子见识下
Niclezhang 2014-09-28
  • 打赏
  • 举报
回复
菜鸟不懂,楼主不要介意
Niclezhang 2014-09-28
  • 打赏
  • 举报
回复
总觉得两个二维数组相加那个相加结果不对。
_梦魇花葬 2014-09-06
  • 打赏
  • 举报
回复
是的,你可以试一试
bingbingzhe 2014-08-15
  • 打赏
  • 举报
回复
这种方法比一维数组实现性能要高吗
diandiandj12 2014-08-04
  • 打赏
  • 举报
回复
学校学习学习学学
YCMyTot 2014-08-01
  • 打赏
  • 举报
回复
学习了!谢谢分享了!
卖水果的net 2014-07-28
  • 打赏
  • 举报
回复
支持新知识。
云满笔记 2014-07-28
  • 打赏
  • 举报
回复
支持新东西
s549836194 2014-07-28
  • 打赏
  • 举报
回复
好,精品,学习到了
line_us 2014-07-27
  • 打赏
  • 举报
回复
支持分享! 沙发首占

580

社区成员

发帖
与我相关
我的任务
社区描述
CUDA™是一种由NVIDIA推出的通用并行计算架构,该架构使GPU能够解决复杂的计算问题。 它包含了CUDA指令集架构(ISA)以及GPU内部的并行计算引擎。
社区管理员
  • CUDA编程社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧