Incorrect ulong 64bit integer addition results. Works fine with Oclgrind, but fails on GPU

I have implemented the SHA512 hashing algorithm, and it was working fine while i was sticking with oclgrind. When I switched to the real GPU, the result turned out to be completely different.

When I was running Oclgrind, I was gettting the correct output:

2712195685281767701

But when I switched to the GPU, I got this one:

1628432321777253681

I think it might be an issue with ulong 64bit integers. After hours of debugging, I must admit that incorrect results appear after adding two ulong integers at the // Compress words lines.

Any thoughts?

EDIT: Also tried on other hardware. The results are different everywhere.
EDIT 2: I’m going crazy, please help me :upside_down_face:. Also it might be something with memory.

#define SHA512_H0 0x6a09e667f3bcc908UL
#define SHA512_H1 0xbb67ae8584caa73bUL
#define SHA512_H2 0x3c6ef372fe94f82bUL
#define SHA512_H3 0xa54ff53a5f1d36f1UL
#define SHA512_H4 0x510e527fade682d1UL
#define SHA512_H5 0x9b05688c2b3e6c1fUL
#define SHA512_H6 0x1f83d9abfb41bd6bUL
#define SHA512_H7 0x5be0cd19137e2179UL

#define sha512_rotr(x, n) ((x >> n) | (x << 64UL - n))
#define sha512_sigma0(x)                                                       \
  (sha512_rotr(x, 1UL) ^ sha512_rotr(x, 8UL) ^ (x >> 7UL))
#define sha512_sigma1(x)                                                       \
  (sha512_rotr(x, 19UL) ^ sha512_rotr(x, 61UL) ^ (x >> 6UL))
#define sha512_usigma0(x)                                                      \
  (sha512_rotr(x, 28UL) ^ sha512_rotr(x, 34UL) ^ sha512_rotr(x, 39UL))
#define sha512_usigma1(x)                                                      \
  (sha512_rotr(x, 14UL) ^ sha512_rotr(x, 18UL) ^ sha512_rotr(x, 41UL))
#define sha512_choice(x, y, z) ((x & y) ^ (~x & z))
#define sha512_majority(x, y, z) ((x & y) ^ (x & z) ^ (y & z))

__constant ulong sha512K[80] = {
    0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL,
    0xe9b5dba58189dbbcUL, 0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
    0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, 0xd807aa98a3030242UL,
    0x12835b0145706fbeUL, 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
    0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL,
    0xc19bf174cf692694UL, 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL,
    0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, 0x2de92c6f592b0275UL,
    0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
    0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, 0xb00327c898fb213fUL,
    0xbf597fc7beef0ee4UL, 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL,
    0x06ca6351e003826fUL, 0x142929670a0e6e70UL, 0x27b70a8546d22ffcUL,
    0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
    0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL,
    0x92722c851482353bUL, 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL,
    0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, 0xd192e819d6ef5218UL,
    0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
    0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 0x2748774cdf8eeb99UL,
    0x34b0bcb5e19b48a8UL, 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL,
    0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL, 0x748f82ee5defb2fcUL,
    0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
    0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL,
    0xc67178f2e372532bUL, 0xca273eceea26619cUL, 0xd186b8c721c0c207UL,
    0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, 0x06f067aa72176fbaUL,
    0x0a637dc5a2c898a6UL, 0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
    0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL,
    0x431d67c49c100d4cUL, 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL,
    0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL};

#define sha512_digest_to_bytes(startfrom, V)                                   \
  outbuffer[startfrom + 0] = (uchar)(V >> 56);                                 \
  outbuffer[startfrom + 1] = (uchar)(V >> 48 & 0xFF);                          \
  outbuffer[startfrom + 2] = (uchar)(V >> 40 & 0xFF00 >> 8);                   \
  outbuffer[startfrom + 3] = (uchar)(V >> 32 & 0xFF0000 >> 16);                \
  outbuffer[startfrom + 4] = (uchar)(V >> 24 & 0xFF000000 >> 24);              \
  outbuffer[startfrom + 5] = (uchar)(V >> 16 & 0xFF00000000 >> 32);            \
  outbuffer[startfrom + 6] = (uchar)(V >> 8 & 0xFF0000000000 >> 40);           \
  outbuffer[startfrom + 7] = (uchar)(V & 0xFF000000000000 >> 48);

static void sha512_hash(const uchar *inbuffer, const int length,
                        uchar *outbuffer) {
  // WARNING: No values longer than 64 bytes are expected
  ulong words[80];

  int total_msg_blocks = length / 8;
  bool done = false;

  for (int block_i = 0; block_i < 80; block_i++) {
    words[block_i] = 0;
    if (done || block_i >= total_msg_blocks) {
      continue; // Just continue filling blocks with 0
    }

    for (int i = 0; i < 8; i++) {
      if (block_i * 8 + i == length) {
        done = true;
        break;
      }

      words[block_i] <<= 8UL;
      words[block_i] |= inbuffer[block_i * 8 + i];
    }
  }

  if (length % 8) {
    words[total_msg_blocks - 1] <<= 1UL;
    words[total_msg_blocks - 1] |= 0b1UL;
  } else {
    words[total_msg_blocks] = 0x8000000000000000UL;
  }

  words[15] = length * 8;

  // Prepare the message schedules
  for (int i = 16; i < 80; i++) {
    words[i] = sha512_sigma1(words[i - 2]) + words[i - 7] +
               sha512_sigma0(words[i - 15]) + words[i - 16];
    // printf("SCH %llu\n", words[i]);
  }

  // Initialize working variables
  ulong A = SHA512_H0;
  ulong B = SHA512_H1;
  ulong C = SHA512_H2;
  ulong D = SHA512_H3;
  ulong E = SHA512_H4;
  ulong F = SHA512_H5;
  ulong G = SHA512_H6;
  ulong H = SHA512_H7;
  // printf("val %llu\n", A);

  ulong T1, T2;
  // Compress the words
  for (int i = 0; i < 80; i++) {
    // ulong T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + H +
    // sha512K[i] +
    //           words[i];
    T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + sha512K[i] + words[i];
    T2 = sha512_usigma0(A) + sha512_majority(A, B, C);

    // Compress words.
    // !!!! THIS IS WHERE GPU FAILS !!!!
    H = G;
    G = F;
    F = E;
    E = D + T1;
    D = C;
    C = B;
    B = A;
    A = T1 + T2;
  }

  A = A + SHA512_H0;
  B = B + SHA512_H1;
  C = C + SHA512_H2;
  D = D + SHA512_H3;
  E = E + SHA512_H4;
  F = F + SHA512_H5;
  G = G + SHA512_H6;
  H = H + SHA512_H7;

  sha512_digest_to_bytes(0, A);
  sha512_digest_to_bytes(8, B);
  sha512_digest_to_bytes(16, C);
  sha512_digest_to_bytes(24, D);
  sha512_digest_to_bytes(32, E);
  sha512_digest_to_bytes(40, F);
  sha512_digest_to_bytes(48, G);
  sha512_digest_to_bytes(56, H);

  // Print the resu;t
  printf("result %llu\n", A);
}

__kernel hash() {
  int gid = get_global_id(0);
  if (gid != 0) {
    return;
  }

  uchar inbuffer[64];
  inbuffer[0] = 176;
  inbuffer[1] = 109;
  inbuffer[2] = 96;
  inbuffer[3] = 146;
  inbuffer[4] = 49;
  inbuffer[5] = 105;
  inbuffer[6] = 16;
  inbuffer[7] = 154;
  inbuffer[8] = 248;
  inbuffer[9] = 101;
  inbuffer[10] = 35;
  inbuffer[11] = 55;
  inbuffer[12] = 75;
  inbuffer[13] = 153;
  inbuffer[14] = 252;
  inbuffer[15] = 205;
  inbuffer[16] = 162;
  inbuffer[17] = 220;
  inbuffer[18] = 42;
  inbuffer[19] = 42;
  inbuffer[20] = 59;
  inbuffer[21] = 235;
  inbuffer[22] = 85;
  inbuffer[23] = 245;
  inbuffer[24] = 87;
  inbuffer[25] = 217;
  inbuffer[26] = 51;
  inbuffer[27] = 250;
  inbuffer[28] = 20;
  inbuffer[29] = 236;
  inbuffer[30] = 144;
  inbuffer[31] = 17;
  inbuffer[32] = 85;
  inbuffer[33] = 160;
  inbuffer[34] = 117;
  inbuffer[35] = 50;
  inbuffer[36] = 158;
  inbuffer[37] = 126;
  inbuffer[38] = 228;
  inbuffer[39] = 40;
  inbuffer[40] = 97;
  inbuffer[41] = 239;
  inbuffer[42] = 252;
  inbuffer[43] = 30;
  inbuffer[44] = 182;
  inbuffer[45] = 184;
  inbuffer[46] = 125;
  inbuffer[47] = 14;
  inbuffer[48] = 17;
  inbuffer[49] = 87;
  inbuffer[50] = 65;
  inbuffer[51] = 191;
  inbuffer[52] = 153;
  inbuffer[53] = 10;
  inbuffer[54] = 216;
  inbuffer[55] = 64;
  inbuffer[56] = 49;
  inbuffer[57] = 210;
  inbuffer[58] = 172;
  inbuffer[59] = 97;
  inbuffer[60] = 205;
  inbuffer[61] = 24;
  inbuffer[62] = 95;
  inbuffer[63] = 216;
  uchar outbuffer[64];
  sha512_hash(inbuffer, 64, outbuffer);
}

P.S. I you feel lucky, you can run the kernel on your hardware and share your result :smiley:

Well, manually unrolled this

  for (int i = 0; i < 80; i++) {
    // ulong T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + H +
    // sha512K[i] +
    //           words[i];
    T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + sha512K[i] + words[i];
    T2 = sha512_usigma0(A) + sha512_majority(A, B, C);

    H = G;
    G = F;
    F = E;
    E = D + T1;
    D = C;
    C = B;
    B = A;
    A = T1 + T2;
  }

to this

#define sha512_round(i)                                                        \
  T1 = H + sha512_usigma1(E) + sha512_choice(E, F, G) + sha512K[i] + words[i]; \
  T2 = sha512_usigma0(A) + sha512_majority(A, B, C);                           \
  H = G;                                                                       \
  G = F;                                                                       \
  F = E;                                                                       \
  E = D + T1;                                                                  \
  D = C;                                                                       \
  C = B;                                                                       \
  B = A;                                                                       \
  A = T1 + T2;
// END DEFINITION
  sha512_round(0);
  sha512_round(1);
  sha512_round(2);
  sha512_round(3);
  sha512_round(4);
  sha512_round(5);
  sha512_round(6);
  sha512_round(7);
  sha512_round(8);
  sha512_round(9);
  sha512_round(10);
  sha512_round(11);
  sha512_round(12);
  sha512_round(13);
  sha512_round(14);
  sha512_round(15);
  sha512_round(16);
  sha512_round(17);
  sha512_round(18);
  sha512_round(19);
  sha512_round(20);
  sha512_round(21);
  sha512_round(22);
  sha512_round(23);
  sha512_round(24);
  sha512_round(25);
  sha512_round(26);
  sha512_round(27);
  sha512_round(28);
  sha512_round(29);
  sha512_round(30);
  sha512_round(31);
  sha512_round(32);
  sha512_round(33);
  sha512_round(34);
  sha512_round(35);
  sha512_round(36);
  sha512_round(37);
  sha512_round(38);
  sha512_round(39);
  sha512_round(40);
  sha512_round(41);
  sha512_round(42);
  sha512_round(43);
  sha512_round(44);
  sha512_round(45);
  sha512_round(46);
  sha512_round(47);
  sha512_round(48);
  sha512_round(49);
  sha512_round(50);
  sha512_round(51);
  sha512_round(52);
  sha512_round(53);
  sha512_round(54);
  sha512_round(55);
  sha512_round(56);
  sha512_round(57);
  sha512_round(58);
  sha512_round(59);
  sha512_round(60);
  sha512_round(61);
  sha512_round(62);
  sha512_round(63);
  sha512_round(64);
  sha512_round(65);
  sha512_round(66);
  sha512_round(67);
  sha512_round(68);
  sha512_round(69);
  sha512_round(70);
  sha512_round(71);
  sha512_round(72);
  sha512_round(73);
  sha512_round(74);
  sha512_round(75);
  sha512_round(76);
  sha512_round(77);
  sha512_round(78);
  sha512_round(79);

and everything started working perfectly fine. opencl is is a (not) nice programming language.

Well, done some changes to the code. It is not working again. I love opencl…

OK after changing

__kernel hash() {

to

__kernel void hash(void) {

on NVIDIA GeForce GTX 1050, drivers 446.14 Win10 Pro x64. Also on Intel® HD Graphics 630 and on Intel® Core™ i5-7300HQ CPU @ 2.50GHz.

What is your configuration and OpenCL drivers?

I found a configuration giving wrong results on the kernel. It is Tahiti on drivers from 16.6.2 on (16.200.1035.0, OpenCL 2079.5). From 22.19.677.257 (OpenCL 2442.9) this kernel even raises an exception in the driver. However OpenCL compiler for GCN1 card is discontinued. I can suggest to use Radeon Crimson 16.4.2 drivers as a workaround, if it is the case.

I found a possible problem that can give you such a strange behaviour completely without driver problem. As 6.12.13.3 of OpenCL 1.2 specification states (as well as the end of paragraph 6.13.13 of OpenCL 2.0 specification), %llu specification for printf is not supported, in principle you should use %lu for your ulong A variable.