Recommending a Good CUDA Programming Resource (Part 2)

_梦魇花葬 2014-07-27 08:56:25
Reference:
http://blog.csdn.net/augusdi/article/details/12434077

7、Page-locked host memory
Code:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
#defineN256
voidSetArrayValue(float*pData,intsize,floatvalue)
{
for(inti=0;i<size;++i)
{
pData[i]=value;
}
}
__global__voidVecAdd(float*A,float*B,float*C)
{
intindex=threadIdx.x;
C[index]=A[index]+B[index];
}
voidPrintArray(float*data,intsize)
{
for(inti=0;i<size;++i)
{
if((i+1)%10==0)
{
printf("\n");
}
printf("%f",data[i]);
}
}
voidmain()
{
//cudaSetDeviceFlags(cudaDeviceMapHost);---可有可无
cudaDevicePropdeviceProp;
cudaGetDeviceProperties(&deviceProp,0);
if(deviceProp.integrated)
{
printf("GPUisintegrated\n");
return;
}
if(!deviceProp.canMapHostMemory)
{
printf("can'tmaphostmemory\n");
return;
}
float*hostPtrA;
cudaHostAlloc(&hostPtrA,sizeof(float)*N,cudaHostAllocDefault|cudaHostAllocMapped);
SetArrayValue(hostPtrA,N,29);
PrintArray(hostPtrA,N);
system("pause");
float*hostPtrB;
cudaHostAlloc(&hostPtrB,sizeof(float)*N,cudaHostAllocDefault|cudaHostAllocMapped);
SetArrayValue(hostPtrB,N,31);
float*devPtrA,*devPtrB;
cudaHostGetDevicePointer(&devPtrA,hostPtrA,0);
cudaHostGetDevicePointer(&devPtrB,hostPtrB,0);
float*hostPtrC;
cudaHostAlloc(&hostPtrC,sizeof(float)*N,cudaHostAllocDefault|cudaHostAllocMapped);
float*devPtrC;
cudaHostGetDevicePointer(&devPtrC,hostPtrC,0);
VecAdd<<<1,N>>>(devPtrA,devPtrB,devPtrC);
cudaDeviceSynchronize();
for(inti=0;i<N;i++)
{
if((i+1)%10==0)
{
printf("\n");
}
printf("%f",hostPtrC[i]);
}
system("pause");
}
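
As an aside, pinning is not limited to buffers that come from cudaHostAlloc: an existing pageable allocation can be pinned and mapped after the fact with cudaHostRegister. A minimal sketch, assuming the same canMapHostMemory check as above (buf and devBuf are illustrative names, not from the original post):

// Sketch: pin and map an existing malloc'd buffer.
float *buf = (float *)malloc(sizeof(float) * N);
cudaHostRegister(buf, sizeof(float) * N, cudaHostRegisterMapped);
float *devBuf;
cudaHostGetDevicePointer(&devBuf, buf, 0);   // device alias of buf
// ... launch kernels that use devBuf ...
cudaHostUnregister(buf);
free(buf);
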
8、Using texture memory
Code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

// Rotate the input image by theta: each thread samples the texture at
// rotated, normalized coordinates centered on (0.5, 0.5).
__global__ void transformKernel(float *output, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
    {
        return;
    }
    float u = x / (float)width;
    float v = y / (float)height;
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
    output[y * width + x] = tex2D(texRef, tu, tv);
}

int main()
{
    int width = 25, height = 25;
    // One 32-bit float channel per texel.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);
    float *h_data = (float *)malloc(width * height * sizeof(float));
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = i * width + j;
        }
    }
    cudaMemcpyToArray(cuArray, 0, 0, h_data, width * height * sizeof(float), cudaMemcpyHostToDevice);
    // Wrap out-of-range coordinates, filter bilinearly, and use
    // normalized [0,1) coordinates.
    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;
    cudaBindTextureToArray(texRef, cuArray, channelDesc);
    float *output;
    cudaMalloc(&output, width * height * sizeof(float));
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
    float angle = 30;   // in radians, as consumed by cosf/sinf
    transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);
    float *hostPtr = (float *)malloc(sizeof(float) * width * height);
    cudaMemcpy(hostPtr, output, sizeof(float) * width * height, cudaMemcpyDeviceToHost);
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            printf("%f ", hostPtr[i * width + j]);
        }
        printf("\n");
    }
    free(h_data);
    free(hostPtr);
    cudaFreeArray(cuArray);
    cudaFree(output);
    system("pause");
    return 0;
}
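
Worth flagging for anyone compiling this today: the texture reference API used above was deprecated and removed in CUDA 12 (cudaMemcpyToArray was likewise deprecated in favor of cudaMemcpy2DToArray), so on current toolkits the same setup goes through the texture object API. A minimal sketch, reusing cuArray from the example above:

// Sketch: texture-object equivalent of the texRef setup above.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;

cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;

cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// The kernel then takes a cudaTextureObject_t parameter and samples
// with tex2D<float>(texObj, tu, tv); clean up with
// cudaDestroyTextureObject(texObj).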

9、Using surface memory
Code:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
surface<void,2>inputSurfRef;
surface<void,2>outputSurfRef;
__global__voidcopyKernel(intwidth,intheight)
{
unsignedintx=blockIdx.x*blockDim.x+threadIdx.x;
unsignedinty=blockIdx.y*blockDim.y+threadIdx.y;
if(x<width&&y<height)
{
uchar4data;
surf2Dread(&data,inputSurfRef,x*4,y);
surf2Dwrite(data,outputSurfRef,x*4,y);
}
}
voidmain()
{
intwidth=256,height=256;
unsignedint*h_data=(unsignedint*)malloc(width*height*sizeof(unsignedint));
for(inti=0;i<height;++i)
{
for(intj=0;j<width;++j)
{
h_data[i*width+j]=3;
}
}
intsize=width*height*sizeof(unsignedint);
cudaChannelFormatDescchannelDesc=cudaCreateChannelDesc(8,8,8,8,cudaChannelFormatKindUnsigned);
cudaArray*cuInputArray;
cudaMallocArray(&cuInputArray,&channelDesc,width,height,cudaArrayzurfaceLoadStore);
cudaArray*cuOutputArray;
cudaMallocArray(&cuOutputArray,&channelDesc,width,height,cudaArraySurfaceLoadStore);
cudaMemcpyToArray(cuInputArray,0,0,h_data,size,cudaMemcpyHostToDevice);
cudaBindSurfaceToArray(inputSurfRef,cuInputArray);
cudaBindSurfaceToArray(outputSurfRef,cuOutputArray);
dim3dimBlock(16,16);
dim3dimGrid((width+dimBlock.x-1)/dimBlock.x,(height+dimBlock.y-1)/dimBlock.y);
copyKernel<<<dimGrid,dimBlock>>>(width,height);
unsignedint*host_output=(unsignedint*)malloc(sizeof(unsignedint)*width*height);
cudaMemcpyFromArray(host_output,cuOutputArray,0,0,size,cudaMemcpyDeviceToHost);
for(inti=0;i<height;++i)
{
for(intj=0;j<width;++j)
{
printf("%u",host_output[i*width+j]);
}
printf("\n");
}
system("pause");
free(host_output);
free(h_data);
cudaFreeArray(cuInputArray);
cudaFreeArray(cuOutputArray);
}
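
The same deprecation applies to surface references: on current toolkits one would create a surface object instead. A minimal sketch, reusing cuInputArray from the example above:

// Sketch: surface-object equivalent of cudaBindSurfaceToArray.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuInputArray;

cudaSurfaceObject_t inputSurfObj = 0;
cudaCreateSurfaceObject(&inputSurfObj, &resDesc);
// Kernels take a cudaSurfaceObject_t parameter and call
// surf2Dread(&data, inputSurfObj, x * 4, y); clean up with
// cudaDestroySurfaceObject(inputSurfObj).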

10、OpenGL and CUDA interop
Code:
References:
https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/
http://stackoverflow.com/questions/12082357/errors-while-using-opengl-buffers-using-visual-studio-2010-in-windows7


#include<cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<gl/glew.h>//要放在下面这一句的前面
#include"cuda_gl_interop.h"
#include<stdio.h>
#include<stdlib.h>
//#include<gl/GL.h>
#include<gl/glut.h>
GLuintpostionsVBO=1;
structcudaGraphicsResource*postionsVBO_CUDA;
intwidth=256;
intheight=256;
__device__floatdev_time=1;
floathost_time=1;
__global__voidcreateVertices(float4*positions,floattime,unsignedintwidth,unsignedintheight);
voidinit()
{
glClearColor(0.f,0.f,0.f,1.f);
glClear(GL_DEPTH_BUFFER_BIT|GL_COLOR_BUFFER_BIT);
}
voidreshape(intwidth,intheight)
{
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
if(width>height)
{
gluPerspective(45,(GLfloat)width/height,0.001,1000);
}else
{
gluPerspective(45,(GLfloat)height/width,0.001,1000);
}
glMatrixMode(GL_MATRIX_MODE);
glLoadIdentity();
}
voiddisplay()
{
float4*positions;
cudaGraphicsMapResources(1,&postionsVBO_CUDA,0);
size_tnumb_bytes;
cudaGraphicsResourceGetMappedPointer((void**)&positions,&numb_bytes,postionsVBO_CUDA);
dim3dimBlock(16,16,1);
dim3dimGrid(width/dimBlock.x,height/dimBlock.y,1);
createVertices<<<dimGrid,dimBlock>>>(positions,dev_time,width,height);
dev_time++;
cudaMemcpy(&time,&host_time,sizeof(float),cudaMemcpyHostToDevice);
cudaGraphicsUnmapResources(1,&postionsVBO_CUDA,0);
glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);
glBindBuffer(GL_ARRAY_BUFFER,postionsVBO);
glVertexPointer(4,GL_FLOAT,0,0);
glEnableClientState(GL_VERTEX_ARRAY);
glDrawArrays(GL_POINTS,0,width*height);
glDisableClientState(GL_VERTEX_ARRAY);
glutSwapBuffers();
glutPostRedisplay();
}
__global__voidcreateVertices(float4*positions,floattime,unsignedintwidth,unsignedintheight)
{
unsignedintx=blockIdx.x*blockDim.x+threadIdx.x;
unsignedinty=blockIdx.y*blockDim.y+threadIdx.y;
floatu=x/(float)width;
floatv=y/(float)height;
u=u*2.f-1.f;
v=v*2.f-1.f;
floatfreq=4.f;
floatw=sinf(u*freq+time)*cosf(v*freq+time)*0.5f;
positions[y*width+x]=make_float4(u,w,v,1.f);
}
intmain(intargc,char*argv[])
{
cudaGLSetGLDevice(0);
glutInit(&argc,argv);
glutInitDisplayMode(GLUT_DOUBLE|GLUT_RGB);
glutInitWindowPosition(0,0);
glutInitWindowSize(100,100);
glutCreateWindow("opengl-cuda");
init();
glutDisplayFunc(display);
glutReshapeFunc(reshape);
glewInit();//http://stackoverflow.com/questions/12344612/unusual-error-using-opengl-buffers-with-cuda-interop-on-ms-visual-studio-2010
glGenBuffers(1,&postionsVBO);
glBindBuffer(GL_ARRAY_BUFFER,postionsVBO);
unsignedintsize=width*height*4*sizeof(float);
glBufferData(GL_ARRAY_BUFFER,size,0,GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER,0);
cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA,postionsVBO,cudaGraphicsMapFlagsWriteDiscard);
glutMainLoop();
}
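
For what it's worth, cudaGraphicsMapFlagsWriteDiscard tells CUDA that the kernel will overwrite the buffer's entire contents and never read it, so the driver is free to discard the previous frame's data when mapping. The example exits through glutMainLoop and never tears anything down; a minimal cleanup sketch (the deleteVBO helper is hypothetical, not from the original post, and would have to be wired to a close or keyboard callback):

// Sketch: release the interop registration before deleting the VBO.
void deleteVBO()
{
    cudaGraphicsUnregisterResource(postionsVBO_CUDA);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glDeleteBuffers(1, &postionsVBO);
}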

11、Formatted output: printf can be called inside device functions, but this requires compute capability at least 2.0
Code:

#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
__global__voidbcast()
{
printf("%d\n",threadIdx.x);
}
voidmain()
{
bcast<<<1,32>>>();
cudaDeviceSynchronize();
system("pause");
}
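
One practical aside: device-side printf writes into a fixed-size FIFO that is only flushed at synchronization points, and output beyond the buffer is silently dropped. The buffer can be resized before launch; a minimal sketch using the documented cudaLimitPrintfFifoSize limit:

// Sketch: query and enlarge the device printf FIFO before launching
// kernels that print heavily (the 8x factor is arbitrary).
size_t fifo_size;
cudaDeviceGetLimit(&fifo_size, cudaLimitPrintfFifoSize);
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, fifo_size * 8);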

12、Assertions inside device-side functions, which likewise require compute capability at least 2.0
Code:

#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<assert.h>
#include<stdlib.h>
__global__voidtestAssert(void)
{
intis_one=1;
intshould_be_one=0;
assert(is_one);
assert(should_be_one);
}
voidmain()
{
testAssert<<<1,1>>>();
cudaDeviceSynchronize();
cudaDeviceReset();
system("pause");
}
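
When the second assert fires, it prints the file, line, and failing expression to stderr and leaves the context in an error state. A minimal sketch of detecting this on the host side:

// Sketch: a failed device assert surfaces as cudaErrorAssert at the
// next synchronization; the context must be reset before further use.
cudaError_t err = cudaDeviceSynchronize();
if (err == cudaErrorAssert)
{
    printf("device assert fired: %s\n", cudaGetErrorString(err));
}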

13、Per-thread allocation on the heap
Code:

#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
__global__voidmallocTestPerThread()
{
char*ptr=(char*)malloc(100);
printf("Thread%dgotpointer:%p\n",threadIdx.x,ptr);
free(ptr);
}
intmain()
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,128*1024*1024);
mallocTestPerThread<<<1,5>>>();
cudaDeviceSynchronize();
system("pause");
return0;
}
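
Note that device-side malloc returns NULL once this heap is exhausted, so real kernels should guard the pointer (the next example does). The configured limit can also be read back; a minimal sketch:

// Sketch: query the device malloc heap size.
size_t heap_size;
cudaDeviceGetLimit(&heap_size, cudaLimitMallocHeapSize);
printf("device malloc heap: %zu bytes\n", heap_size);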

14、Per-thread-block allocation on the heap
Code:

#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
__global__voidmallocTestPerThreadBlock()
{
__shared__int*data;
if(threadIdx.x==0)
{
data=(int*)malloc(blockDim.x*64);
}
__syncthreads();
if(data==NULL)
{
return;
}
int*ptr=data;
for(inti=0;i<64;++i)
{
ptr[i*blockDim.x+threadIdx.x]=threadIdx.x;
}
__syncthreads();
if(threadIdx.x==0)
{
free(data);
}
}
intmain()
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,128*1024*1024);
mallocTestPerThreadBlock<<<10,128>>>();
cudaDeviceSynchronize();
system("pause");
return0;
}
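
A closing point on the device heap: memory obtained with device-side malloc lives for the lifetime of the CUDA context, not of the kernel that allocated it, so one launch can allocate, a later one consume, and a third free. A minimal self-contained sketch of that pattern (names like dev_buf are illustrative):

#include <stdio.h>
#include "cuda_runtime.h"

__device__ int *dev_buf;   // device pointer shared between launches

__global__ void allocate(int n)
{
    dev_buf = (int *)malloc(n * sizeof(int));
}

__global__ void fill()
{
    if (dev_buf != NULL)
        dev_buf[threadIdx.x] = threadIdx.x;
}

__global__ void release()
{
    free(dev_buf);
}

int main()
{
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    allocate<<<1, 1>>>(64);
    fill<<<1, 64>>>();     // same stream, so ordering is guaranteed
    release<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}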

14 replies
安静橘子 2016-07-30
OP, I'm developing on a Jetson TK1. A kernel I wrote runs in about 1 ms when I move data with cudaMemcpy, but with the cudaHostGetDevicePointer approach it jumps to about 6 ms. Have you run into anything similar?
安静橘子 2016-07-30
Thanks.
伊顺鸣 2014-07-31
Not bad at all.
副组长 2014-07-31
Just not sure how much further NVIDIA cards can go.
cattpon 2014-07-28
I'll give it a try and then report back~
云满笔记 2014-07-28
New stuff, +1.
qq_18526005 2014-07-27
Impressive.
