global调用global替换方法

Alllllex 2014-11-25 02:35:24

下边是一个哈希值转换的global函数，通过main.cu调用 Hash<<<1,62>>>(input_d, nonce_d);
暂时实现了把整个哈希函数放在GPU上跑，现在想进一步做并行优化：让这个global函数里调用的其他device函数也利用并行运算来提高速度。我尝试把它们也改成global函数，但编译时提示：hash.cuh(124): error: calling a __global__ function("OutputFunction") from a __global__ function("Hash") is only allowed on the compute_35 architecture or above
请问有其他的方法解决这个问题吗？不知道问题描述明白了没，新人一枚，需要补充的请告诉我，先行谢过。

//#include "hash.h"

#include <stdio.h>

#include <string.h>



// Sizes used by the hash functions in this file.

// MS: number of 32-bit words in the state array `a` (declared as a[MS] in Hash).
#define MS 32

// BW: number of 32-bit words per input block and per row of the array `b`.
#define BW  8

// BL: number of rows in the array `b` (declared as b[BL*BW] in Hash).
#define BL  32

// BI: number of RoundFunction iterations Hash runs without new input
// after the padded last block has been processed.
#define BI 128



// Lookup table: index 0..15 yields the matching uppercase hexadecimal digit.
__device__ __constant__ char _hex[] = "0123456789ABCDEF";





// Forward declarations for the functions defined further down in this file.

// Kernel: each thread tries one candidate character as the first input byte.
__global__ void Hash(char* ,char* );

// Copies two state words into the output pair (out, a).
__device__ void OutputFunction(uint32_t*,uint32_t*);

// XORs one input block into the state arrays (in, a, b).
__device__ void InputFunction(uint32_t*,uint32_t*,uint32_t*);

// Applies one round to the state arrays (a, b).
__device__ void RoundFunction(uint32_t*,uint32_t*);

// Maps a nibble value to its hex digit. The parameter is an int, not an int*:
// the previous prototype hex(int*) declared an overload that is never defined
// and did not match the definition or its callers.
__device__ char hex(int);

// Converts 32 input bytes into a 64-character hex string plus terminator (in, out).
__device__ void stringtohex_BE(char*, char*);



// Rotation helper for 32-bit words: reduces the rotation amount y to the
// range 0..31 and forwards to ROR32, which is defined outside this file.
__device__
uint32_t ROR(uint32_t x, int y){
    // Keeping only the low five bits maps every y (including negative
    // values) into 0..31.
    const int amount = y & 0x1F;
    return ROR32(x, amount);
}

 

// Translates a (row i, column j) pair into an offset in a flat array that
// stores BW words per row.
__device__
inline unsigned int index2(unsigned int i, unsigned int j){
    const unsigned int rowStart = i * BW;
    return rowStart + j;
}



// Returns the character stored in the _hex table at position `nibble`.
// No range check is performed; callers pass values 0..15.
__device__
char hex(int nibble){
    const char digit = *(_hex + nibble);
    return digit;
}



// Writes the hexadecimal text form of the first 32 bytes of `in` to `out`.
// Each byte produces two digits, high nibble first, followed by a single
// terminating '\0'; `out` therefore needs room for 65 characters.
__device__
void stringtohex_BE(char* in, char* out){
    char* cursor = out;
    for (int byteIdx = 0; byteIdx < 32; ++byteIdx) {
        *cursor++ = hex((in[byteIdx] & 0xF0) >> 4);   // high nibble
        *cursor++ = hex(in[byteIdx] & 0x0F);          // low nibble
    }
    *cursor = '\0';
}



// Returns true when the first LEADING_ZEROES characters of `hash` are all '0'.
// LEADING_ZEROES is defined outside this file.
__device__
bool check_hash(char* hash){
    int pos = 0;
    // Advance over the leading '0' characters, stopping at the first mismatch.
    while (pos < LEADING_ZEROES && hash[pos] == '0')
        ++pos;
    return pos >= LEADING_ZEROES;
}

// Returns byte `idx` of the message hashed by one thread: the thread's own
// candidate character at position 0, the shared input buffer everywhere else.
static __device__ __forceinline__
char hashMessageByte(const char* input, char candidate, size_t idx)
{
    return (idx == 0) ? candidate : input[idx];
}

// Kernel: every thread hashes the message in `input` with its first byte
// replaced by one candidate character (a-z, A-Z, 0-9; selected by
// threadIdx.x) and reports the candidate if the hex digest passes check_hash.
//
// Launch layout: one thread per candidate, e.g. Hash<<<1,62>>>(input, nonce).
//   Threads with threadIdx.x >= 62 exit immediately, so a larger block
//   (e.g. 64 threads) is safe. Only threadIdx.x is used; blockIdx is ignored.
// input: device buffer; bytes 1 .. INPUT_SIZE+NONCE_SIZE-1 are read.
// nonce: device buffer of at least one char; nonce[0] receives a matching
//   candidate. It is left untouched when no thread finds a match.
//
// Differences from the previous version:
//   * The candidate is kept in a per-thread variable instead of being written
//     to the shared input[0]. Previously all threads raced on input[0], so
//     they hashed whichever candidate was written last and could report a
//     character that was never the one hashed.
//   * input[0] is now written only by a thread that found a match (same value
//     as nonce[0]). If several threads match, any one of them wins.
__global__
void Hash(char* input, char* nonce)
{
    const char alphabet[62]={'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
    'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
    '0','1','2','3','4','5','6','7','8','9'};
    const unsigned int alphabetSize = sizeof(alphabet)/sizeof(alphabet[0]);

    // Guard against launches with more threads than candidates.
    const unsigned int tx = threadIdx.x;
    if (tx >= alphabetSize)
        return;
    const char candidate = alphabet[tx];

    char output[33];       // 32 digest bytes + terminator
    char output_str[65];   // 64 hex digits + terminator
    uint32_t in[BW];
    uint32_t out[2];

    const uint32_t inputSize=(uint32_t)(INPUT_SIZE+NONCE_SIZE);

    // Per-thread hash state, zero-initialised.
    uint32_t a[MS]={0};
    uint32_t b[BL*BW]={0};

    // Process all complete blocks of BW little-endian 32-bit words.
    unsigned int p = 0;
    while(p+sizeof(uint32_t)*BW <=inputSize) {
        for(unsigned int q=0; q<BW; q++) {
            in[q] = 0;
            for(unsigned int w=0; w<sizeof(uint32_t); w++)
                in[q] |= (uint32_t)((unsigned char)(hashMessageByte(input, candidate, p+q*sizeof(uint32_t)+w))) << (8*w);
        }
        p += sizeof(uint32_t)*BW;
        InputFunction(in,a,b);
        RoundFunction(a,b);
    }

    // Padding: remaining bytes, then a single 0x01 byte, then zeros.
    char last_block[(BW+1)*sizeof(uint32_t)/sizeof(char)]={0};
    const uint32_t tail = inputSize-p;
    for(uint32_t i=0;i<tail;i++)
        last_block[i]=hashMessageByte(input, candidate, p+i);
    last_block[tail]=(char) 0x01;

    for(unsigned int q=0; q<BW; q++) {
        in[q] = 0;
        for(unsigned int w=0; w<sizeof(uint32_t); w++)
            in[q] |= (uint32_t)((unsigned char)(last_block[q*sizeof(uint32_t)+w])) << (8*w);
    }

    InputFunction(in,a,b);
    RoundFunction(a,b);

    // Do BI iterations without new input.
    for(uint32_t i=0; i<BI; i++)
        RoundFunction(a,b);

    // Collect 32 output bytes, two 32-bit words (8 bytes) per round.
    for(uint32_t i=0;i<32/(2*sizeof(uint32_t));i++){
        RoundFunction(a,b);
        OutputFunction(out,a);

        for(unsigned int q=0; q<2; q++)
            for(unsigned int w=0; w<sizeof(uint32_t); w++)
                output[i*sizeof(uint32_t)*2+q*sizeof(uint32_t)+w] = (char)((out[q] >> (8*w)) & 0xFF);
    }
    output[32]='\0';

    stringtohex_BE(output,output_str);

    if (check_hash(output_str)) {
        // Publish the candidate this thread actually hashed.
        nonce[0]=candidate;
        input[0]=candidate;
    }
}



// Applies one round to the state: rotates the rows of `b` by one position,
// XORs twelve words of `a` into `b`, runs the three-step transformation of
// `a`, and finally XORs the saved last row of `b` into a[13..13+BW-1].
// a: array of MS words; b: array of BL rows of BW words (see index2).
__device__
void RoundFunction(uint32_t* a,uint32_t* b)
{
    // Remember the last row of b; it becomes row 0 after the shift.
    uint32_t lastRow[BW];
    for(unsigned int col=0; col<BW; ++col)
        lastRow[col] = b[index2(BL-1,col)];

    // Move every row one position up, highest row first so nothing is
    // overwritten before it has been copied.
    for(unsigned int row=BL-1; row>=1; --row) {
        for(unsigned int col=0; col<BW; ++col)
            b[index2(row,col)] = b[index2(row-1,col)];
    }

    for(unsigned int col=0; col<BW; ++col)
        b[index2(0,col)] = lastRow[col];

    // Feed twelve words of a into b, using the values a had on entry.
    for(unsigned int k=1; k<=12; ++k)
        b[index2(k,(k-1)%BW)] ^= a[k];

    uint32_t scratch[MS];

    // Step 1: non-linear combination of three neighbouring words.
    for(unsigned int i=0; i<MS; ++i) {
        const uint32_t next1 = a[(i+1)%MS];
        const uint32_t next2 = a[(i+2)%MS];
        scratch[i] = a[i]^(next1|(~next2));
    }

    // Step 2: word i is taken from position 7*i mod MS and rotated by
    // i*(i+1)/2, kept here as a running sum 0, 1, 3, 6, ...
    unsigned int rotation = 0;
    for(unsigned int i=0; i<MS; ++i) {
        rotation += i;
        a[i] = ROR(scratch[(7*i)%MS], rotation);
    }

    // Step 3: linear mix of words i, i+1 and i+4.
    for(unsigned int i=0; i<MS; ++i)
        scratch[i] = a[i]^a[(i+1)%MS]^a[(i+4)%MS];

    // Flip the lowest bit of word 0.
    scratch[0] ^= 1;

    for(unsigned int i=0; i<MS; ++i)
        a[i] = scratch[i];

    // Feed the saved row of b back into a.
    for(unsigned int col=0; col<BW; ++col)
        a[13+col] ^= lastRow[col];
}



// XORs one input block of BW words into the state: into a[16..16+BW-1] and
// into row 0 of b.
__device__
void InputFunction(uint32_t* in,uint32_t* a,uint32_t* b)
{
    unsigned int word = 0;

    // First pass: fold the block into a.
    while(word < BW) {
        a[16+word] ^= in[word];
        ++word;
    }

    // Second pass: fold the block into row 0 of b.
    for(word = 0; word != BW; ++word)
        b[index2(0,word)] ^= in[word];
}



// Copies a[1] and a[2] into out[0] and out[1].
__device__
void OutputFunction(uint32_t* out,uint32_t* a)
{
    out[0] = a[1];
    out[1] = a[2];
}

...全文

485 4 打赏收藏转发到动态举报

写回复

用AI写文章

4 条回复

切换为时间正序

请发表友善的回复…

发表回复

Alllllex 2014-11-26

打赏
举报

嗯嗯，循环嵌套确实多。开始就是直接把Hash里边的各个循环嵌套用cuda thread运行，速度慢得不行。现在思路大概清晰了些：先把整个Hash放入GPU并行运算，然后再把Hash里边的RoundFunction、InputFunction、OutputFunction的循环嵌套也优化成并行。然后就卡在上边的情况了，因为main里边只有一个block，并且把62个thread都给了nonce，想着可不可以多给几个thread，一部分给nonce，一部分给RoundFunction、InputFunction、OutputFunction里边的循环。简单地试了一下OutputFunction，貌似找不到coin（源程序是利用hash函数找“虚拟比特币”），估计是哪部分转换错了。还有一个想法是在main调用Hash时改成多个block和thread，本来以为优化要求一次只能一个input、只能一个block，理论上应该是可以使用多个block的。不知道这种想法是否可行？

xiah_sunny 2014-11-26

打赏
举报

->暂时实现了把整个哈希函数放在GPU上跑，现在想进一步的并行优化<- 有没有用什么profiler看看？cuda用的是visual profiler？opencl里有个概念叫GPR，就是每个thread占用了太多的资源，导致同时运行的线程块很少，kernel速度就慢了，不知道cuda有没有这个概念... 从代码里看各种循环嵌套，速度可能确实很慢...

linxxx3 2014-11-25

打赏
举报

从lz的描述看，你需要的是 dynamic parallelism。cuda 5.0以上支持，并且需要计算能力3.5及以上的GPU（即报错信息里的 compute_35）；它允许从kernel（就是global函数）里启动新的kernel，语法跟从cpu启动kernel相同。一般使用的方法，是用id=0的线程去启动一个新kernel。这个机制的一个主要目的，是减少移植已有程序的修改量，因为基本不用改原先的函数调用关系。要实现kernel内部的函数，并且加速，用global函数还是device函数，都同样可以做到，写法不同而已。lz 去查查 dynamic parallelism，看是不是能用得上。

Alllllex 2014-11-25

打赏
举报

kernelF<<<(1,1),(16,16)>>>(A);
__device__ kernelF(A){
__shared__ smem[16][16];
i = threadIdx.y;
j = threadIdx.x;
smem[i][j] = A[i][j]; // load to smem
A[i][j] = ( smem[i-1][j-1] + smem[i-1][j] +
… + smem[i+1][i+1] ) / 9;
}

这种方法？请问有其他的解决方法吗？因为是在服务器上修改practice，我不能提高cuda版本，只能单纯地在原有代码上进行修改。尝试过上边的这种写法，提示的是“a __device__ function call cannot be configured”。然后昨晚还尝试把main里边给nonce用的thread也给device函数（例如最后边的OutputFunction）使用，写成

int j=threadIdx.x;
if(j<2)
	out[j] = a[j+1];

或者在main里边

cudaMemcpy(input_d, input, (INPUT_SIZE+NONCE_SIZE+1)*sizeof(char),
                              cudaMemcpyHostToDevice);
cudaMemcpy(nonce_d, nonce, sizeof(char),
                              cudaMemcpyHostToDevice);
            //calculate hash
            Hash<<<1,64>>>(input_d, nonce_d);
cudaMemcpy(input, input_d, (INPUT_SIZE+NONCE_SIZE+1)*sizeof(char),
                              cudaMemcpyDeviceToHost);
cudaMemcpy(nonce, nonce_d, sizeof(char),
                              cudaMemcpyDeviceToHost);

把Hash<<<1,62>>>改写成1,64然后