Recommending a Good CUDA Programming Resource (Part 2)

_梦魇花葬 2014-07-27 08:56:25
Reference:
http://blog.csdn.net/augusdi/article/details/12434077

7、Page-locked host memory
Code:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
#defineN256
voidSetArrayValue(float*pData,intsize,floatvalue)
{
for(inti=0;i<size;++i)
{
pData[i]=value;
}
}
__global__voidVecAdd(float*A,float*B,float*C)
{
intindex=threadIdx.x;
C[index]=A[index]+B[index];
}
voidPrintArray(float*data,intsize)
{
for(inti=0;i<size;++i)
{
if((i+1)%10==0)
{
printf("\n");
}
printf("%f",data[i]);
}
}
voidmain()
{
//cudaSetDeviceFlags(cudaDeviceMapHost);---可有可无
cudaDevicePropdeviceProp;
cudaGetDeviceProperties(&deviceProp,0);
if(deviceProp.integrated)
{
printf("GPUisintegrated\n");
return;
}
if(!deviceProp.canMapHostMemory)
{
printf("can'tmaphostmemory\n");
return;
}
float*hostPtrA;
cudaHostAlloc(&hostPtrA,sizeof(float)*N,cudaHostAllocDefault|cudaHostAllocMapped);
SetArrayValue(hostPtrA,N,29);
PrintArray(hostPtrA,N);
system("pause");
float*hostPtrB;
cudaHostAlloc(&hostPtrB,sizeof(float)*N,cudaHostAllocDefault|cudaHostAllocMapped);
SetArrayValue(hostPtrB,N,31);
float*devPtrA,*devPtrB;
cudaHostGetDevicePointer(&devPtrA,hostPtrA,0);
cudaHostGetDevicePointer(&devPtrB,hostPtrB,0);
float*hostPtrC;
cudaHostAlloc(&hostPtrC,sizeof(float)*N,cudaHostAllocDefault|cudaHostAllocMapped);
float*devPtrC;
cudaHostGetDevicePointer(&devPtrC,hostPtrC,0);
VecAdd<<<1,N>>>(devPtrA,devPtrB,devPtrC);
cudaDeviceSynchronize();
for(inti=0;i<N;i++)
{
if((i+1)%10==0)
{
printf("\n");
}
printf("%f",hostPtrC[i]);
}
system("pause");
}
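
As an aside, pinning is not limited to buffers that come from cudaHostAlloc: an existing pageable allocation can be pinned and mapped after the fact with cudaHostRegister. A minimal sketch, assuming the same canMapHostMemory check as above (buf and devBuf are illustrative names, not from the original post):

// Sketch: pin and map an existing malloc'd buffer.
float *buf = (float *)malloc(sizeof(float) * N);
cudaHostRegister(buf, sizeof(float) * N, cudaHostRegisterMapped);
float *devBuf;
cudaHostGetDevicePointer(&devBuf, buf, 0);   // device alias of buf
// ... launch kernels that use devBuf ...
cudaHostUnregister(buf);
free(buf);
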
8、Using texture memory
Code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

// Rotate the input image by theta: each thread samples the texture at
// rotated, normalized coordinates centered on (0.5, 0.5).
__global__ void transformKernel(float *output, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
    {
        return;
    }
    float u = x / (float)width;
    float v = y / (float)height;
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
    output[y * width + x] = tex2D(texRef, tu, tv);
}

int main()
{
    int width = 25, height = 25;
    // One 32-bit float channel per texel.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);
    float *h_data = (float *)malloc(width * height * sizeof(float));
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = i * width + j;
        }
    }
    cudaMemcpyToArray(cuArray, 0, 0, h_data, width * height * sizeof(float), cudaMemcpyHostToDevice);
    // Wrap out-of-range coordinates, filter bilinearly, and use
    // normalized [0,1) coordinates.
    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;
    cudaBindTextureToArray(texRef, cuArray, channelDesc);
    float *output;
    cudaMalloc(&output, width * height * sizeof(float));
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
    float angle = 30;   // in radians, as consumed by cosf/sinf
    transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);
    float *hostPtr = (float *)malloc(sizeof(float) * width * height);
    cudaMemcpy(hostPtr, output, sizeof(float) * width * height, cudaMemcpyDeviceToHost);
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            printf("%f ", hostPtr[i * width + j]);
        }
        printf("\n");
    }
    free(h_data);
    free(hostPtr);
    cudaFreeArray(cuArray);
    cudaFree(output);
    system("pause");
    return 0;
}
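
Worth flagging for anyone compiling this today: the texture reference API used above was deprecated and removed in CUDA 12 (cudaMemcpyToArray was likewise deprecated in favor of cudaMemcpy2DToArray), so on current toolkits the same setup goes through the texture object API. A minimal sketch, reusing cuArray from the example above:

// Sketch: texture-object equivalent of the texRef setup above.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;

cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;

cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// The kernel then takes a cudaTextureObject_t parameter and samples
// with tex2D<float>(texObj, tu, tv); clean up with
// cudaDestroyTextureObject(texObj).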

9、Using surface memory
Code:

#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<stdio.h>
#include<stdlib.h>
surface<void,2>inputSurfRef;
surface<void,2>outputSurfRef;
__global__voidcopyKernel(intwidth,intheight)
{
unsignedintx=blockIdx.x*blockDim.x+threadIdx.x;
unsignedinty=blockIdx.y*blockDim.y+threadIdx.y;
if(x<width&&y<height)
{
uchar4data;
surf2Dread(&data,inputSurfRef,x*4,y);
surf2Dwrite(data,outputSurfRef,x*4,y);
}
}
voidmain()
{
intwidth=256,height=256;
unsignedint*h_data=(unsignedint*)malloc(width*height*sizeof(unsignedint));
for(inti=0;i<height;++i)
{
for(intj=0;j<width;++j)
{
h_data[i*width+j]=3;
}
}
intsize=width*height*sizeof(unsignedint);
cudaChannelFormatDescchannelDesc=cudaCreateChannelDesc(8,8,8,8,cudaChannelFormatKindUnsigned);
cudaArray*cuInputArray;
cudaMallocArray(&cuInputArray,&channelDesc,width,height,cudaArrayzurfaceLoadStore);
cudaArray*cuOutputArray;
cudaMallocArray(&cuOutputArray,&channelDesc,width,height,cudaArraySurfaceLoadStore);
cudaMemcpyToArray(cuInputArray,0,0,h_data,size,cudaMemcpyHostToDevice);
cudaBindSurfaceToArray(inputSurfRef,cuInputArray);
cudaBindSurfaceToArray(outputSurfRef,cuOutputArray);
dim3dimBlock(16,16);
dim3dimGrid((width+dimBlock.x-1)/dimBlock.x,(height+dimBlock.y-1)/dimBlock.y);
copyKernel<<<dimGrid,dimBlock>>>(width,height);
unsignedint*host_output=(unsignedint*)malloc(sizeof(unsignedint)*width*height);
cudaMemcpyFromArray(host_output,cuOutputArray,0,0,size,cudaMemcpyDeviceToHost);
for(inti=0;i<height;++i)
{
for(intj=0;j<width;++j)
{
printf("%u",host_output[i*width+j]);
}
printf("\n");
}
system("pause");
free(host_output);
free(h_data);
cudaFreeArray(cuInputArray);
cudaFreeArray(cuOutputArray);
}
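
The same deprecation applies to surface references: on current toolkits one would create a surface object instead. A minimal sketch, reusing cuInputArray from the example above:

// Sketch: surface-object equivalent of cudaBindSurfaceToArray.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuInputArray;

cudaSurfaceObject_t inputSurfObj = 0;
cudaCreateSurfaceObject(&inputSurfObj, &resDesc);
// Kernels take a cudaSurfaceObject_t parameter and call
// surf2Dread(&data, inputSurfObj, x * 4, y); clean up with
// cudaDestroySurfaceObject(inputSurfObj).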

10、OpenGL and CUDA interop
Code:
References:
https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/
http://stackoverflow.com/questions/12082357/errors-while-using-opengl-buffers-using-visual-studio-2010-in-windows7


#include<cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<gl/glew.h>//要放在下面这一句的前面
#include"cuda_gl_interop.h"
#include<stdio.h>
#include<stdlib.h>
//#include<gl/GL.h>
#include<gl/glut.h>
GLuintpostionsVBO=1;
structcudaGraphicsResource*postionsVBO_CUDA;
intwidth=256;
intheight=256;
__device__floatdev_time=1;
floathost_time=1;
__global__voidcreateVertices(float4*positions,floattime,unsignedintwidth,unsignedintheight);
voidinit()
{
glClearColor(0.f,0.f,0.f,1.f);
glClear(GL_DEPTH_BUFFER_BIT|GL_COLOR_BUFFER_BIT);
}
voidreshape(intwidth,intheight)
{
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
if(width>height)
{
gluPerspective(45,(GLfloat)width/height,0.001,1000);
}else
{
gluPerspective(45,(GLfloat)height/width,0.001,1000);
}
glMatrixMode(GL_MATRIX_MODE);
glLoadIdentity();
}
voiddisplay()
{
float4*positions;
cudaGraphicsMapResources(1,&postionsVBO_CUDA,0);
size_tnumb_bytes;
cudaGraphicsResourceGetMappedPointer((void**)&positions,&numb_bytes,postionsVBO_CUDA);
dim3dimBlock(16,16,1);
dim3dimGrid(width/dimBlock.x,height/dimBlock.y,1);
createVertices<<<dimGrid,dimBlock>>>(positions,dev_time,width,height);
dev_time++;
cudaMemcpy(&time,&host_time,sizeof(float),cudaMemcpyHostToDevice);
cudaGraphicsUnmapResources(1,&postionsVBO_CUDA,0);
glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);
glBindBuffer(GL_ARRAY_BUFFER,postionsVBO);
glVertexPointer(4,GL_FLOAT,0,0);
glEnableClientState(GL_VERTEX_ARRAY);
glDrawArrays(GL_POINTS,0,width*height);
glDisableClientState(GL_VERTEX_ARRAY);
glutSwapBuffers();
glutPostRedisplay();
}
__global__voidcreateVertices(float4*positions,floattime,unsignedintwidth,unsignedintheight)
{
unsignedintx=blockIdx.x*blockDim.x+threadIdx.x;
unsignedinty=blockIdx.y*blockDim.y+threadIdx.y;
floatu=x/(float)width;
floatv=y/(float)height;
u=u*2.f-1.f;
v=v*2.f-1.f;
floatfreq=4.f;
floatw=sinf(u*freq+time)*cosf(v*freq+time)*0.5f;
positions[y*width+x]=make_float4(u,w,v,1.f);
}
intmain(intargc,char*argv[])
{
cudaGLSetGLDevice(0);
glutInit(&argc,argv);
glutInitDisplayMode(GLUT_DOUBLE|GLUT_RGB);
glutInitWindowPosition(0,0);
glutInitWindowSize(100,100);
glutCreateWindow("opengl-cuda");
init();
glutDisplayFunc(display);
glutReshapeFunc(reshape);
glewInit();//http://stackoverflow.com/questions/12344612/unusual-error-using-opengl-buffers-with-cuda-interop-on-ms-visual-studio-2010
glGenBuffers(1,&postionsVBO);
glBindBuffer(GL_ARRAY_BUFFER,postionsVBO);
unsignedintsize=width*height*4*sizeof(float);
glBufferData(GL_ARRAY_BUFFER,size,0,GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER,0);
cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA,postionsVBO,cudaGraphicsMapFlagsWriteDiscard);
glutMainLoop();
}
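
For what it's worth, cudaGraphicsMapFlagsWriteDiscard tells CUDA that the kernel will overwrite the buffer's entire contents and never read it, so the driver is free to discard the previous frame's data when mapping. The example exits through glutMainLoop and never tears anything down; a minimal cleanup sketch (the deleteVBO helper is hypothetical, not from the original post, and would have to be wired to a close or keyboard callback):

// Sketch: release the interop registration before deleting the VBO.
void deleteVBO()
{
    cudaGraphicsUnregisterResource(postionsVBO_CUDA);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glDeleteBuffers(1, &postionsVBO);
}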

11、Formatted output: printf can be called inside device functions, but this requires compute capability at least 2.0
Code:

#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
__global__voidbcast()
{
printf("%d\n",threadIdx.x);
}
voidmain()
{
bcast<<<1,32>>>();
cudaDeviceSynchronize();
system("pause");
}
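
One practical aside: device-side printf writes into a fixed-size FIFO that is only flushed at synchronization points, and output beyond the buffer is silently dropped. The buffer can be resized before launch; a minimal sketch using the documented cudaLimitPrintfFifoSize limit:

// Sketch: query and enlarge the device printf FIFO before launching
// kernels that print heavily (the 8x factor is arbitrary).
size_t fifo_size;
cudaDeviceGetLimit(&fifo_size, cudaLimitPrintfFifoSize);
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, fifo_size * 8);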

12、Assertions inside device-side functions, which likewise require compute capability at least 2.0
Code:

#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
#include<assert.h>
#include<stdlib.h>
__global__voidtestAssert(void)
{
intis_one=1;
intshould_be_one=0;
assert(is_one);
assert(should_be_one);
}
voidmain()
{
testAssert<<<1,1>>>();
cudaDeviceSynchronize();
cudaDeviceReset();
system("pause");
}
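
When the second assert fires, it prints the file, line, and failing expression to stderr and leaves the context in an error state. A minimal sketch of detecting this on the host side:

// Sketch: a failed device assert surfaces as cudaErrorAssert at the
// next synchronization; the context must be reset before further use.
cudaError_t err = cudaDeviceSynchronize();
if (err == cudaErrorAssert)
{
    printf("device assert fired: %s\n", cudaGetErrorString(err));
}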

13、Per-thread allocation on the heap
Code:

#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
__global__voidmallocTestPerThread()
{
char*ptr=(char*)malloc(100);
printf("Thread%dgotpointer:%p\n",threadIdx.x,ptr);
free(ptr);
}
intmain()
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,128*1024*1024);
mallocTestPerThread<<<1,5>>>();
cudaDeviceSynchronize();
system("pause");
return0;
}
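
Note that device-side malloc returns NULL once this heap is exhausted, so real kernels should guard the pointer (the next example does). The configured limit can also be read back; a minimal sketch:

// Sketch: query the device malloc heap size.
size_t heap_size;
cudaDeviceGetLimit(&heap_size, cudaLimitMallocHeapSize);
printf("device malloc heap: %zu bytes\n", heap_size);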

14、Per-thread-block allocation on the heap
Code:

#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<helper_cuda.h>
#include"cuda_runtime.h"
#include"device_launch_parameters.h"
__global__voidmallocTestPerThreadBlock()
{
__shared__int*data;
if(threadIdx.x==0)
{
data=(int*)malloc(blockDim.x*64);
}
__syncthreads();
if(data==NULL)
{
return;
}
int*ptr=data;
for(inti=0;i<64;++i)
{
ptr[i*blockDim.x+threadIdx.x]=threadIdx.x;
}
__syncthreads();
if(threadIdx.x==0)
{
free(data);
}
}
intmain()
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,128*1024*1024);
mallocTestPerThreadBlock<<<10,128>>>();
cudaDeviceSynchronize();
system("pause");
return0;
}
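
A closing point on the device heap: memory obtained with device-side malloc lives for the lifetime of the CUDA context, not of the kernel that allocated it, so one launch can allocate, a later one consume, and a third free. A minimal self-contained sketch of that pattern (names like dev_buf are illustrative):

#include <stdio.h>
#include "cuda_runtime.h"

__device__ int *dev_buf;   // device pointer shared between launches

__global__ void allocate(int n)
{
    dev_buf = (int *)malloc(n * sizeof(int));
}

__global__ void fill()
{
    if (dev_buf != NULL)
        dev_buf[threadIdx.x] = threadIdx.x;
}

__global__ void release()
{
    free(dev_buf);
}

int main()
{
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    allocate<<<1, 1>>>(64);
    fill<<<1, 64>>>();     // same stream, so ordering is guaranteed
    release<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}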

14 replies
安静橘子 2016-07-30
OP, I'm developing on a Jetson TK1. A kernel I wrote runs in about 1 ms when I move data with cudaMemcpy, but with the cudaHostGetDevicePointer approach it jumps to about 6 ms. Have you run into anything similar?
安静橘子 2016-07-30
Thanks.
伊顺鸣 2014-07-31
Not bad at all.
副组长 2014-07-31
Just not sure how much further NVIDIA cards can go.
cattpon 2014-07-28
I'll give it a try and then report back~
云满笔记 2014-07-28
New stuff, +1.
qq_18526005 2014-07-27
Impressive.
