603
社区成员
发帖
与我相关
我的任务
分享
extern "C"
int cudaCensus(cv::Mat& left_img, cv::Mat& right_img, unsigned short *dsi);
extern "C"
void cudaInit();
//#define USE_CUDA
void StereoTest(void)
{
const int disp_num = 64;
cudaInit();
Mat imgLeft = imread("1-L.png",0);
Mat imgRight= imread("1-R.png",0);
cv::Size img_size;
img_size.width = 1024;
img_size.height = 400;
cv::resize(imgLeft, imgLeft, img_size);
cv::resize(imgRight, imgRight, img_size);
int height = imgLeft.rows;
int width = imgLeft.cols;
u16 *dsi =(u16 *)malloc(sizeof(int)*width*height*disp_num);
u16 *dsiAgg=(u16 *)malloc(sizeof(int)*width*height*disp_num);
cout << "runing..." <<endl;
#ifdef USE_CUDA
memset(dsiAgg, 0, height * width * disp_num* sizeof(unsigned short));
cudaCensus(imgLeft, imgRight, dsiAgg);
#else
clock_t start = clock();
Census((uchar *)imgLeft.data, (uchar *)imgRight.data, height, width, disp_num, dsi);
printf("time of census: %.3f s\n", (double)(clock()-start)/CLOCKS_PER_SEC);
clock_t TransposeDSI_clock = clock();
TransposeDSI(width, height, disp_num, dsiAgg, dsi);
printf("time of TransposeDSI: %.3f s\n", (double)(clock()-TransposeDSI_clock)/CLOCKS_PER_SEC);
printf("total time: %.3f s\n\n\n", (double)(clock()-start)/CLOCKS_PER_SEC);
#endif
free(dsi);
free(dsiAgg);
}
int main(int argc, char *argv[])
{
StereoTest();
return 0;
}
#include "stereo.h"
inline uint64 getCensus(uchar* source, int width, int i, int j)
{
uint64 value = 0;
uchar center = source[i*width + j];
for (int m = -3; m <= 3; m++)
{
for (int n = -4; n <= 4; n++)
{
value = (value << 1);
value += (source[(i+m)*width+j+n] < center);
}
}
return value;
}
uchar hamDist16(ushort x, ushort y)
{
uchar dist = 0;
ushort val = (x^y); // yihuo
while(val)
{
++dist;
val &= val - 1;
}
return dist;
}
void Census(uchar *left, uchar *right, int height, int width, int dispnum, u16 *dsi)
{
static bool firstRun = true;
static uchar *popcount16LUT = new uchar[65536];
uint64 *leftCensus = new uint64[width];
uint64 *rightCensus = new uint64[width];
if(firstRun)
{
for (int i = 0; i < 65536; i++) {
popcount16LUT[i] = hamDist16(i,0);
}
memset(dsi,0,height*width*dispnum*sizeof(u16));
firstRun = false;
}
for (int i = 3; i < height - 3; i++)
{
for (int j = 4; j < width - 4; j++)
{
leftCensus[j] = getCensus(left, width, i, j);
rightCensus[j] = getCensus(right, width, i, j);
}
// Hamming Distance
for (int j = 4; j < width - 4; j++)
{
for (int d = 0; d < dispnum; d++)
{
if( d <= j - 4)
{
// total 44 bytes
dsi[(i*width+j)*dispnum + d] = popcount16LUT[ (ushort)(leftCensus[j] ^ rightCensus[j-d]) ]
+ popcount16LUT[ (ushort)((leftCensus[j]>>16) ^ (rightCensus[j-d]>>16)) ]
+ popcount16LUT[ (ushort)((leftCensus[j]>>32) ^ (rightCensus[j-d]>>32)) ];
}
else
{
dsi[(i*width+j)*dispnum + d] = 0xff;
}
}
}
}
delete [] leftCensus;
delete [] rightCensus;
}
void TransposeDSI(int width, int height, int n_disp, u16 *dsiSrc, u16 *dsiDes)
{
int index1, index2;
for(int j=0;j<height;j++)
{
for(int i=0;i<width;i++)
{
index1=j*width*n_disp+i*n_disp;
index2=i*height*n_disp+j*n_disp;
memcpy(dsiDes+index2, dsiSrc+index1, sizeof(u16)*n_disp);
}
}
}
static bool firstRun = true;
static uchar *popcount16LUT = new uchar[65536];
static const int dispnum = 64;
static unsigned char hamDist_16(ushort x, ushort y)
{
unsigned char dist = 0;
unsigned short val = (x^y);
while(val)
{
++dist;
val &= val - 1;
}
return dist;
}
__global__ void TransposeDSI(unsigned short *dsiSrc, unsigned short *dsiDes, int width, int height, int n_disp)
{
int i = threadIdx.x;
int j = blockIdx.x;
int index1, index2;
int d;
index1=j*width*n_disp+i*n_disp;
index2=i*height*n_disp+j*n_disp;
dsiDes += index2;
dsiSrc += index1;
for(d = 0; d < n_disp; d++) {
dsiDes[d] = dsiSrc[d];
dsiSrc[d] = 0;
}
//memcpy(dsiDes+index2, dsiSrc+index1, sizeof(unsigned short)*n_disp);
}
__global__ void getCensus(unsigned char *left_data, unsigned char *right_data,
unsigned char *popcount16LUT, unsigned short *dsi,
int height, int width)
{
__shared__ unsigned long long int leftCensus[1024];
__shared__ unsigned long long int rightCensus[1024];
unsigned long long int left_val = 0;
unsigned long long int right_val = 0;
unsigned long long int val_l;
unsigned long long int val_r;
int tid = threadIdx.x;
int bid = blockIdx.x;
if(tid >= 4 && bid >= 3) {
unsigned char left_center = left_data[bid*width + tid];
unsigned char right_center = right_data[bid*width + tid];
for (int m = -3; m <= 3; m++)
{
for (int n = -4; n <= 4; n++)
{
left_val = (left_val << 1);
left_val += (left_data[(bid+m) * width + tid + n] < left_center);
right_val = (right_val << 1);
right_val += (right_data[(bid+m) * width + tid + n] < right_center);
}
}
leftCensus[tid] = left_val;
rightCensus[tid] = right_val;
__syncthreads();
for (int d = 0; d < 64; d++)
{
if( d <= tid - 4)
{
val_l = leftCensus[tid];
val_r = rightCensus[tid-d];
// total 44 bytes
dsi[(bid*width+tid)*64 + d] = popcount16LUT[ (unsigned short)(val_l ^ val_r)]
+ popcount16LUT[(unsigned short)((val_l>>16) ^ (val_r>>16))]
+ popcount16LUT[(unsigned short)((val_l>>32) ^ (val_r>>32))];
}
else
{
dsi[(bid*width+tid)*64 + d] = 0xff;
}
}
}
else {
leftCensus[tid] = 0;
rightCensus[tid] = 0;
}
}
uchar *left_data, *right_data;
float *disp_img;
unsigned short *out_dist;
unsigned short *dist_agg;
extern "C"
void cudaInit()
{
HANDLE_ERROR(cudaMalloc( (void**)&left_data, WIDTH * HEIGHT * sizeof(uchar)));
HANDLE_ERROR(cudaMalloc( (void**)&right_data, HEIGHT * WIDTH * sizeof(uchar)));
HANDLE_ERROR(cudaMalloc( (void**)&out_dist, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
HANDLE_ERROR(cudaMalloc( (void**)&dist_agg, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
HANDLE_ERROR(cudaMalloc( (void**)&disp_img, WIDTH * HEIGHT *sizeof(float)));
if(firstRun)
{
HANDLE_ERROR(cudaMalloc( (void**)&popcount16LUT, 65536));
static uchar *tmp_popcount16LUT = new uchar[65536];
// build the popcount16LUT
for (int i = 0; i < 65536; i++) {
tmp_popcount16LUT[i] = hamDist_16(i,0);
}
HANDLE_ERROR(cudaMemcpy(popcount16LUT, tmp_popcount16LUT, 65536, cudaMemcpyHostToDevice));
firstRun = false;
}
}
extern "C"
int cudaCensus(cv::Mat& left_img, cv::Mat& right_img, unsigned short *dsi) {
int height = left_img.rows;
int width = left_img.cols;
int data_size = height * width * sizeof(uchar);
HANDLE_ERROR(cudaMemcpy(left_data, left_img.data, data_size, cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(right_data, right_img.data, data_size, cudaMemcpyHostToDevice));
cudaEvent_t getCensus_start, getCensus_stop;
cudaEventCreate(&getCensus_start);
cudaEventCreate(&getCensus_stop);
cudaEventRecord(getCensus_start, NULL);
getCensus<<<height, width>>>(left_data, right_data, popcount16LUT, out_dist, height, width);
cudaDeviceSynchronize();
cudaEventRecord(getCensus_stop, NULL);
cudaEventSynchronize(getCensus_stop);
float getCensus_time = 0.0f;
cudaEventElapsedTime(&getCensus_time, getCensus_start, getCensus_stop);
cudaEventDestroy(getCensus_start);
cudaEventDestroy(getCensus_stop);
std::cout << " getCensus time=" << getCensus_time<<std::endl;
/* TransposeDSI */
cudaEvent_t TransposeDSI_1_start, TransposeDSI_1_stop;
cudaEventCreate(&TransposeDSI_1_start);
cudaEventCreate(&TransposeDSI_1_stop);
cudaEventRecord(TransposeDSI_1_start, NULL);
TransposeDSI<<<height, width>>>(out_dist, dist_agg, width, height, dispnum);
cudaDeviceSynchronize();
cudaEventRecord(TransposeDSI_1_stop, NULL);
cudaEventSynchronize(TransposeDSI_1_stop);
float TransposeDSI_1_time = 0.0f;
cudaEventElapsedTime(&TransposeDSI_1_time, TransposeDSI_1_start, TransposeDSI_1_stop);
cudaEventDestroy(TransposeDSI_1_start);
cudaEventDestroy(TransposeDSI_1_stop);
std::cout << " TransposeDSI time=" << TransposeDSI_1_time<<std::endl;
HANDLE_ERROR(cudaMemcpy(dsi, dist_agg, width * height *sizeof(unsigned short), cudaMemcpyDeviceToHost));
return 0;
}