关于opencl 在 amd卡跟nvidia卡上运行结果的区别
这个问题在其他地方也问过,但是没有得到答案。目前严重困扰中。
同样的程序,在nvidia卡上运行结果正确,但是在amd卡上结果错误。(开发环境是vs2012+最新的驱动),试验了多台n卡和a卡的机器(卡片类型不同)都是这样的结果,n卡一个结果,a卡一个结果(不同的a卡结果是相同的,虽然是错误的)。
问题出在哪里?下面是完整的kernel代码。
// kernel source code -------------------------------------------
typedef union USHA1_type
{
unsigned int sha1uint[5];
unsigned char sha1uchar[20];
}USHA1_t;
inline uint SWAP32(uint x)
{
x = rotate(x, 16U);
return ((x & 0x00FF00FF) << 8) + ((x >> 8) & 0x00FF00FF);
}
//sha1 ==================
#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
#define K4 0xCA62C1D6
#define H1 0x67452301
#define H2 0xEFCDAB89
#define H3 0x98BADCFE
#define H4 0x10325476
#define H5 0xC3D2E1F0
#define F1(x,y,z) (z ^ (x & (y ^ z)))
#define F2(x,y,z) (x ^ y ^ z)
#define F3(x,y,z) ((x & y) | (z & (x | y)))
#define F4(x,y,z) (x ^ y ^ z)
#define R(t) (temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ W[(t - 14) & 0x0F] ^ W[t & 0x0F], ( W[t & 0x0F] = rotate((int)temp,1) ) )
#define P1(a,b,c,d,e,x) \
{ \
e += rotate((int)a,5) + F1(b,c,d) + K1 + x; b = rotate((int)b,30);\
}
#define P2(a,b,c,d,e,x) \
{ \
e += rotate((int)a,5) + F2(b,c,d) + K2 + x; b = rotate((int)b,30);\
}
#define P3(a,b,c,d,e,x) \
{ \
e += rotate((int)a,5) + F3(b,c,d) + K3 + x; b = rotate((int)b,30);\
}
#define P4(a,b,c,d,e,x) \
{ \
e += rotate((int)a,5) + F4(b,c,d) + K4 + x; b = rotate((int)b,30);\
}
//1-63 BYTES sha1
inline void sha1_crypt(__private unsigned char *plain, unsigned int plainlen, __private unsigned int *digest)
{
int t;
int stop, mmod;
unsigned int i, ulen;
unsigned int W[16] = {0};
unsigned int temp, A,B,C,D,E;
A = H1;
B = H2;
C = H3;
D = H4;
E = H5;
for (t = 1; t < 15; t++)
{
W[t] = 0x00000000;
}
i = plainlen;
stop = i / 4 ;
for (t = 0 ; t < stop ; t++){
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
W[t] |= ((unsigned char) plain[t * 4 + 2]) << 8;
W[t] |= (unsigned char) plain[t * 4 + 3];
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
W[t] |= ((unsigned char) plain[t * 4 + 2]) << 8;
W[t] |= ((unsigned char) 0x80) ;
} else if (mmod == 2) {
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
ulen = (i * 8) & 0xFFFFFFFF;
W[15] = ulen ;
P1( A, B, C, D, E, W[0] );
P1( E, A, B, C, D, W[1] );
P1( D, E, A, B, C, W[2] );
P1( C, D, E, A, B, W[3] );
P1( B, C, D, E, A, W[4] );
P1( A, B, C, D, E, W[5] );
P1( E, A, B, C, D, W[6] );
P1( D, E, A, B, C, W[7] );
P1( C, D, E, A, B, W[8] );
P1( B, C, D, E, A, W[9] );
P1( A, B, C, D, E, W[10] );
P1( E, A, B, C, D, W[11] );
P1( D, E, A, B, C, W[12] );
P1( C, D, E, A, B, W[13] );
P1( B, C, D, E, A, W[14] );
P1( A, B, C, D, E, W[15] );
P1( E, A, B, C, D, R(16) );
P1( D, E, A, B, C, R(17) );
P1( C, D, E, A, B, R(18) );
P1( B, C, D, E, A, R(19) );
P2( A, B, C, D, E, R(20) );
P2( E, A, B, C, D, R(21) );
P2( D, E, A, B, C, R(22) );
P2( C, D, E, A, B, R(23) );
P2( B, C, D, E, A, R(24) );
P2( A, B, C, D, E, R(25) );
P2( E, A, B, C, D, R(26) );
P2( D, E, A, B, C, R(27) );
P2( C, D, E, A, B, R(28) );
P2( B, C, D, E, A, R(29) );
P2( A, B, C, D, E, R(30) );
P2( E, A, B, C, D, R(31) );
P2( D, E, A, B, C, R(32) );
P2( C, D, E, A, B, R(33) );
P2( B, C, D, E, A, R(34) );
P2( A, B, C, D, E, R(35) );
P2( E, A, B, C, D, R(36) );
P2( D, E, A, B, C, R(37) );
P2( C, D, E, A, B, R(38) );
P2( B, C, D, E, A, R(39) );
P3( A, B, C, D, E, R(40) );
P3( E, A, B, C, D, R(41) );
P3( D, E, A, B, C, R(42) );
P3( C, D, E, A, B, R(43) );
P3( B, C, D, E, A, R(44) );
P3( A, B, C, D, E, R(45) );
P3( E, A, B, C, D, R(46) );
P3( D, E, A, B, C, R(47) );
P3( C, D, E, A, B, R(48) );
P3( B, C, D, E, A, R(49) );
P3( A, B, C, D, E, R(50) );
P3( E, A, B, C, D, R(51) );
P3( D, E, A, B, C, R(52) );
P3( C, D, E, A, B, R(53) );
P3( B, C, D, E, A, R(54) );
P3( A, B, C, D, E, R(55) );
P3( E, A, B, C, D, R(56) );
P3( D, E, A, B, C, R(57) );
P3( C, D, E, A, B, R(58) );
P3( B, C, D, E, A, R(59) );
P4( A, B, C, D, E, R(60) );
P4( E, A, B, C, D, R(61) );
P4( D, E, A, B, C, R(62) );
P4( C, D, E, A, B, R(63) );
P4( B, C, D, E, A, R(64) );
P4( A, B, C, D, E, R(65) );
P4( E, A, B, C, D, R(66) );
P4( D, E, A, B, C, R(67) );
P4( C, D, E, A, B, R(68) );
P4( B, C, D, E, A, R(69) );
P4( A, B, C, D, E, R(70) );
P4( E, A, B, C, D, R(71) );
P4( D, E, A, B, C, R(72) );
P4( C, D, E, A, B, R(73) );
P4( B, C, D, E, A, R(74) );
P4( A, B, C, D, E, R(75) );
P4( E, A, B, C, D, R(76) );
P4( D, E, A, B, C, R(77) );
P4( C, D, E, A, B, R(78) );
P4( B, C, D, E, A, R(79) );
digest[0] = SWAP32(A + H1);
digest[1] = SWAP32(B + H2);
digest[2] = SWAP32(C + H3);
digest[3] = SWAP32(D + H4);
digest[4] = SWAP32(E + H5);
}
__kernel void test_sha1_kernel(__global unsigned int* gout)
{
unsigned int id = get_global_id(0);
unsigned int i = 0;
//two input data
unsigned char InData1[16] = {0};
unsigned char InData2[16] = {0};
//two calout data
USHA1_t sha1out1;
USHA1_t sha1out2;
//init data
for(i = 0; i < 5; i ++)
{
sha1out1.sha1uint[i] = 0;
sha1out2.sha1uint[i] = 0;
}
for(i = 0; i < 16; i ++)
{
InData1[i] = 0x03;
InData2[i] = 0x38;
}
//two out temp
unsigned char out1[4] = {0};
unsigned char out2[4] = {0};
for(i = 0; i < 4; i++)
{
out1[i] = 0;
out2[i] = 0;
}
//cal 1
unsigned int *psha1out1 = (unsigned int *)(sha1out1.sha1uint);
sha1_crypt(InData1, 8, psha1out1);
sha1_crypt(InData1, 8, psha1out1);
//save output1
for(i = 0; i < 4; i++)
{
out1[i] = sha1out1.sha1uchar[i];
}
//cal 2
unsigned int *psha1out2 = (unsigned int *)(sha1out2.sha1uint);
sha1_crypt(InData2, 8, psha1out2);
//save output2
for(i = 0; i < 4; i++)
{
out2[i] = sha1out2.sha1uchar[i];
}
//out to cpu
if(id == 0)
{
gout[0] = (unsigned int)out1[0];
gout[1] = (unsigned int)out1[1];
gout[2] = (unsigned int)out1[2];
gout[3] = (unsigned int)out1[3];
gout[4] = (unsigned int)out2[0];
gout[5] = (unsigned int)out2[1];
gout[6] = (unsigned int)out2[2];
gout[7] = (unsigned int)out2[3];
}
}