I have implemented the SHA512 hashing algorithm, and it was working fine while i was sticking with oclgrind. When I switched to the real GPU, the result turned out to be completely different.
When I was running Oclgrind, I was gettting the correct output:
2712195685281767701
But when I switched to the GPU, I got this one:
1628432321777253681
I think it might be an issue with ulong 64bit integers. After hours of debugging, I must admit that incorrect results appear after adding two ulong
integers at the // Compress words
lines.
Any thoughts?
EDIT: Also tried on other hardware. The results are different everywhere.
EDIT 2: I’m going crazy, please help me . Also it might be something with memory.
#define SHA512_H0 0x6a09e667f3bcc908UL
#define SHA512_H1 0xbb67ae8584caa73bUL
#define SHA512_H2 0x3c6ef372fe94f82bUL
#define SHA512_H3 0xa54ff53a5f1d36f1UL
#define SHA512_H4 0x510e527fade682d1UL
#define SHA512_H5 0x9b05688c2b3e6c1fUL
#define SHA512_H6 0x1f83d9abfb41bd6bUL
#define SHA512_H7 0x5be0cd19137e2179UL
#define sha512_rotr(x, n) ((x >> n) | (x << 64UL - n))
#define sha512_sigma0(x) \
(sha512_rotr(x, 1UL) ^ sha512_rotr(x, 8UL) ^ (x >> 7UL))
#define sha512_sigma1(x) \
(sha512_rotr(x, 19UL) ^ sha512_rotr(x, 61UL) ^ (x >> 6UL))
#define sha512_usigma0(x) \
(sha512_rotr(x, 28UL) ^ sha512_rotr(x, 34UL) ^ sha512_rotr(x, 39UL))
#define sha512_usigma1(x) \
(sha512_rotr(x, 14UL) ^ sha512_rotr(x, 18UL) ^ sha512_rotr(x, 41UL))
#define sha512_choice(x, y, z) ((x & y) ^ (~x & z))
#define sha512_majority(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
__constant ulong sha512K[80] = {
0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL,
0xe9b5dba58189dbbcUL, 0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, 0xd807aa98a3030242UL,
0x12835b0145706fbeUL, 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL,
0xc19bf174cf692694UL, 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL,
0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, 0x2de92c6f592b0275UL,
0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, 0xb00327c898fb213fUL,
0xbf597fc7beef0ee4UL, 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL,
0x06ca6351e003826fUL, 0x142929670a0e6e70UL, 0x27b70a8546d22ffcUL,
0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL,
0x92722c851482353bUL, 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL,
0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, 0xd192e819d6ef5218UL,
0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 0x2748774cdf8eeb99UL,
0x34b0bcb5e19b48a8UL, 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL,
0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL, 0x748f82ee5defb2fcUL,
0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL,
0xc67178f2e372532bUL, 0xca273eceea26619cUL, 0xd186b8c721c0c207UL,
0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, 0x06f067aa72176fbaUL,
0x0a637dc5a2c898a6UL, 0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL,
0x431d67c49c100d4cUL, 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL,
0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL};
#define sha512_digest_to_bytes(startfrom, V) \
outbuffer[startfrom + 0] = (uchar)(V >> 56); \
outbuffer[startfrom + 1] = (uchar)(V >> 48 & 0xFF); \
outbuffer[startfrom + 2] = (uchar)(V >> 40 & 0xFF00 >> 8); \
outbuffer[startfrom + 3] = (uchar)(V >> 32 & 0xFF0000 >> 16); \
outbuffer[startfrom + 4] = (uchar)(V >> 24 & 0xFF000000 >> 24); \
outbuffer[startfrom + 5] = (uchar)(V >> 16 & 0xFF00000000 >> 32); \
outbuffer[startfrom + 6] = (uchar)(V >> 8 & 0xFF0000000000 >> 40); \
outbuffer[startfrom + 7] = (uchar)(V & 0xFF000000000000 >> 48);
static void sha512_hash(const uchar *inbuffer, const int length,
uchar *outbuffer) {
// WARNING: No values longer than 64 bytes are expected
ulong words[80];
int total_msg_blocks = length / 8;
bool done = false;
for (int block_i = 0; block_i < 80; block_i++) {
words[block_i] = 0;
if (done || block_i >= total_msg_blocks) {
continue; // Just continue filling blocks with 0
}
for (int i = 0; i < 8; i++) {
if (block_i * 8 + i == length) {
done = true;
break;
}
words[block_i] <<= 8UL;
words[block_i] |= inbuffer[block_i * 8 + i];
}
}
if (length % 8) {
words[total_msg_blocks - 1] <<= 1UL;
words[total_msg_blocks - 1] |= 0b1UL;
} else {
words[total_msg_blocks] = 0x8000000000000000UL;
}
words[15] = length * 8;
// Prepare the message schedules
for (int i = 16; i < 80; i++) {
words[i] = sha512_sigma1(words[i - 2]) + words[i - 7] +
sha512_sigma0(words[i - 15]) + words[i - 16];
// printf("SCH %llu\n", words[i]);
}
// Initialize working variables
ulong A = SHA512_H0;
ulong B = SHA512_H1;
ulong C = SHA512_H2;
ulong D = SHA512_H3;
ulong E = SHA512_H4;
ulong F = SHA512_H5;
ulong G = SHA512_H6;
ulong H = SHA512_H7;
// printf("val %llu\n", A);
ulong T1, T2;
// Compress the words
for (int i = 0; i < 80; i++) {
// ulong T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + H +
// sha512K[i] +
// words[i];
T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + sha512K[i] + words[i];
T2 = sha512_usigma0(A) + sha512_majority(A, B, C);
// Compress words.
// !!!! THIS IS WHERE GPU FAILS !!!!
H = G;
G = F;
F = E;
E = D + T1;
D = C;
C = B;
B = A;
A = T1 + T2;
}
A = A + SHA512_H0;
B = B + SHA512_H1;
C = C + SHA512_H2;
D = D + SHA512_H3;
E = E + SHA512_H4;
F = F + SHA512_H5;
G = G + SHA512_H6;
H = H + SHA512_H7;
sha512_digest_to_bytes(0, A);
sha512_digest_to_bytes(8, B);
sha512_digest_to_bytes(16, C);
sha512_digest_to_bytes(24, D);
sha512_digest_to_bytes(32, E);
sha512_digest_to_bytes(40, F);
sha512_digest_to_bytes(48, G);
sha512_digest_to_bytes(56, H);
// Print the resu;t
printf("result %llu\n", A);
}
__kernel hash() {
int gid = get_global_id(0);
if (gid != 0) {
return;
}
uchar inbuffer[64];
inbuffer[0] = 176;
inbuffer[1] = 109;
inbuffer[2] = 96;
inbuffer[3] = 146;
inbuffer[4] = 49;
inbuffer[5] = 105;
inbuffer[6] = 16;
inbuffer[7] = 154;
inbuffer[8] = 248;
inbuffer[9] = 101;
inbuffer[10] = 35;
inbuffer[11] = 55;
inbuffer[12] = 75;
inbuffer[13] = 153;
inbuffer[14] = 252;
inbuffer[15] = 205;
inbuffer[16] = 162;
inbuffer[17] = 220;
inbuffer[18] = 42;
inbuffer[19] = 42;
inbuffer[20] = 59;
inbuffer[21] = 235;
inbuffer[22] = 85;
inbuffer[23] = 245;
inbuffer[24] = 87;
inbuffer[25] = 217;
inbuffer[26] = 51;
inbuffer[27] = 250;
inbuffer[28] = 20;
inbuffer[29] = 236;
inbuffer[30] = 144;
inbuffer[31] = 17;
inbuffer[32] = 85;
inbuffer[33] = 160;
inbuffer[34] = 117;
inbuffer[35] = 50;
inbuffer[36] = 158;
inbuffer[37] = 126;
inbuffer[38] = 228;
inbuffer[39] = 40;
inbuffer[40] = 97;
inbuffer[41] = 239;
inbuffer[42] = 252;
inbuffer[43] = 30;
inbuffer[44] = 182;
inbuffer[45] = 184;
inbuffer[46] = 125;
inbuffer[47] = 14;
inbuffer[48] = 17;
inbuffer[49] = 87;
inbuffer[50] = 65;
inbuffer[51] = 191;
inbuffer[52] = 153;
inbuffer[53] = 10;
inbuffer[54] = 216;
inbuffer[55] = 64;
inbuffer[56] = 49;
inbuffer[57] = 210;
inbuffer[58] = 172;
inbuffer[59] = 97;
inbuffer[60] = 205;
inbuffer[61] = 24;
inbuffer[62] = 95;
inbuffer[63] = 216;
uchar outbuffer[64];
sha512_hash(inbuffer, 64, outbuffer);
}
P.S. I you feel lucky, you can run the kernel on your hardware and share your result