2,408
社区成员
发帖
与我相关
我的任务
分享
__shared__ float maxnum[64], minnum[64];
for(int i = 0; i < 64; ++i)
{
__syncthreads();
data[threadIdx.x] = spec[(blockIdx.x << 16) + i * blockDim.x + threadIdx.x];
__syncthreads();
//max
for(int offset = blockDim.x / 2; offset >= 1; offset = offset / 2)
{
__syncthreads();
if(threadIdx.x < offset)
{
data[threadIdx.x] = data[threadIdx.x] > data[threadIdx.x + offset] ? data[threadIdx.x] : data[threadIdx.x + offset];
}
__syncthreads();
}]
__syncthreads();
//min
for(int offset = blockDim.x / 2; offset >= 1; offset = offset / 2)
{
__syncthreads();
if(threadIdx.x >= blockDim.x - offset)
{
data[threadIdx.x] = data[threadIdx.x] < data[threadIdx.x - offset] ? data[threadIdx.x] : data[threadIdx.x - offset];
}
__syncthreads();
}
__syncthreads();
if(threadIdx.x == 0)
{
maxnum[i] = data[0];
minnum[i] = data[1023];
}
__syncthreads();
}
__syncthreads();
if(threadIdx.x < 32)
{
for(int offset = 32; offset >= 1; offset >>= 1)
{
if(threadIdx.x < offset)
{
maxnum[threadIdx.x] = maxnum[threadIdx.x] > maxnum[threadIdx.x + offset] ? maxnum[threadIdx.x] : maxnum[threadIdx.x + offset];
minnum[threadIdx.x] = minnum[threadIdx.x] < maxnum[threadIdx.x + offset] ? minnum[threadIdx.x] : minnum[threadIdx.x + offset];
}
__syncthreads();
}
}
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cufft.h>
#include <math.h>
__global__ void re(float * spec)
{
__shared__ float data[1024];
__shared__ float maxnum[64], minnum[64];
for(int i = 0; i < 64; ++i)
{
__syncthreads();
data[threadIdx.x] = spec[(blockIdx.x << 16) + i * blockDim.x + threadIdx.x];
__syncthreads();
//max
for(int offset = blockDim.x / 2; offset >= 1; offset = offset / 2)
{
__syncthreads();
if(threadIdx.x < offset)
{
data[threadIdx.x] = data[threadIdx.x] > data[threadIdx.x + offset] ? data[threadIdx.x] : data[threadIdx.x + offset];
}
__syncthreads();
}
__syncthreads();
//min
for(int offset = blockDim.x / 2; offset >= 1; offset = offset / 2)
{
__syncthreads();
if(threadIdx.x >= blockDim.x - offset)
{
data[threadIdx.x] = data[threadIdx.x] < data[threadIdx.x - offset] ? data[threadIdx.x] : data[threadIdx.x - offset];
}
__syncthreads();
}
__syncthreads();
if(threadIdx.x == 0)
{
maxnum[i] = data[0];
minnum[i] = data[1023];
}
__syncthreads();
}
__syncthreads();
if(threadIdx.x < 32)
{
for(int offset = 32; offset >= 1; offset >>= 1)
{
if(threadIdx.x < offset)
{
maxnum[threadIdx.x] = maxnum[threadIdx.x] > maxnum[threadIdx.x + offset] ? maxnum[threadIdx.x] : maxnum[threadIdx.x + offset];
minnum[threadIdx.x] = minnum[threadIdx.x] < maxnum[threadIdx.x + offset] ? minnum[threadIdx.x] : minnum[threadIdx.x + offset];
}
__syncthreads();
}
}
__syncthreads();
if(threadIdx.x == 0)
printf("max = %f min = %f\n", maxnum[0], minnum[0]);
}
int main()
{
float *spec = (float*)malloc(sizeof(float) * 65536);
float *d_spec;
for(int i = 0; i < 65536; i++)
{
spec[i]= 1;
}
spec[0]=5;
spec[10254]=-15;
dim3 dimGrid(1,1,1);
dim3 dimBlock(1024,1,1);
cudaMalloc((void **)&d_spec, 65536 * sizeof(float));
cudaMemset(d_spec, 0, 65536 * sizeof(float));
cudaMemcpy(d_spec, spec, sizeof(float) * 65536, cudaMemcpyHostToDevice);
re<<<dimGrid,dimBlock>>>(d_spec);
free(spec);
cudaFree(d_spec);
}
以这个代码为例,结果不正确。 maxnum = 5 minnun = 1