One question is whether the compute shader stage will be exposed in the Nvidia Cg language (where should I ask?), similar to how the tessellation stages were added in Cg 3.0. If not, another question is whether the Nvidia Cg compiler cgc.exe will at least get support for it, so we can compile GLSL compute shaders to NV_compute_program5 assembly similar to what I get from the driver when assembly errors occur (see below).
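For example, I would hope to eventually be able to run something like this (a hypothetical invocation: cgc 3.1 does have the -oglsl and -profile options, but as far as I know it does not expose the gp5cp profile that the driver evidently uses internally, see the #profile line in the dump below):
cgc -oglsl -profile gp5cp -o matmul.cpasm matmul.comp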
Second, here is what looks like a compute shader compiler bug that I'm afraid Nvidia will not fix, for security/hang-prevention kinds of reasons:
I was happy to see I could port Nvidia's matmul CUDA sample to an OpenGL compute shader without much effort, basically by doing two things:
1. Adding these defines:
#define blockIdx gl_WorkGroupID
#define __syncthreads() barrier()
#define threadIdx gl_LocalInvocationID // CUDA's threadIdx is the within-block index, i.e. gl_LocalInvocationID, not gl_GlobalInvocationID
#define __shared__ shared
and changing all int variables in the CUDA kernel to uint (another solution would have been to cast gl_WorkGroupID and gl_LocalInvocationID to int; see the sketch after this list).
2. Moving the shared memory definitions out of the kernel into global scope:
__shared__ REAL As[BLOCK_SIZE][BLOCK_SIZE];
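As an aside, the cast-based alternative mentioned in point 1 could also go into the defines themselves; a minimal sketch (untested):
#define blockIdx ivec3(gl_WorkGroupID)
#define threadIdx ivec3(gl_LocalInvocationID)
With these two, bx, by, tx and ty could stay int exactly as in the CUDA kernel.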
With those two changes the code was free of GLSL errors, but the real shame shows up at the assembly compile:
Internal error: assembly compile error for compute shader at offset 108543:
-- error message --
line 2094, column 6: error: BAR not allowed inside flow control blocks.
line 2108, column 6: error: BAR not allowed inside flow control blocks.
First, let's be clear: this code works in CUDA and is recommended by Nvidia for its GPUs. Second, it relies on a fine detail of the hardware: barriers are allowed inside control flow as long as all warps in the block follow the same path through it.
So why not in OpenGL compute shaders? As far as I can tell, the GLSL 4.30 spec even permits barrier() inside uniform flow control in compute shaders (the stricter main()-only rule is for tessellation control shaders).
Also, note that the "flow control" here is caused by this loop:
for (uint a = aBegin, b = bBegin;
     a <= aEnd;
     a += aStep, b += bStep)
This is not a branch, and what's more, it executes the same number of iterations for every thread; but in the form stated that is not trivial for the compiler to infer. So I thought, hey, let me fix it, and changed it to this (in my sample, matrix width = height = 1024 and block size = 32, so the iteration count is 1024/32 = 32):
uint a = aBegin, b = bBegin;
for (int i = 0; i < 32; i++)
{
    // ..loop body containing barriers
    a += aStep, b += bStep;
}
Even with this I get the same error, so the only way to fix it is to unroll the loop manually, which is ugly to say the least. I even added #pragma unroll to the loop to force the compiler to expand it for me, but that does not work. Is there an Nvidia way to force unrolling, like #pragma unroll in CUDA?
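For what it's worth, this is the kind of manual unrolling I mean; a minimal sketch (untested), where a macro replaces the outer loop body so that no flow control surrounds the barriers (the inner k loop contains no barrier, so it can stay):
#define TILE_STEP \
    AS(ty, tx) = A[a + wA * ty + tx]; \
    BS(ty, tx) = B[b + wB * ty + tx]; \
    __syncthreads(); \
    for (int k = 0; k < BLOCK_SIZE; ++k) \
        Csub += AS(ty, k) * BS(k, tx); \
    __syncthreads(); \
    a += aStep; b += bStep;
uint a = aBegin, b = bBegin;
TILE_STEP TILE_STEP TILE_STEP TILE_STEP
// ...repeated until there are 32 copies in total (wA / BLOCK_SIZE)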
Full shader:
#version 430 core
#extension GL_ARB_compute_shader : require
#extension GL_ARB_shader_storage_buffer_object : require
#extension GL_ARB_gpu_shader_fp64 : require
#define REAL float
//#define REAL double
#define BUFFER_A 0
#define BUFFER_B 1
#define BUFFER_C 2
#define BLOCK_SIZE 32
layout (local_size_x = BLOCK_SIZE, local_size_y = BLOCK_SIZE) in;
layout(binding = BUFFER_A) buffer aBuffer
{
    REAL A[];
};
layout(binding = BUFFER_B) buffer bBuffer
{
    REAL B[];
};
layout(binding = BUFFER_C) buffer cBuffer
{
    REAL C[];
};
//matrixMul(float *C, float *A, float *B, int wA, int wB)
#define blockIdx gl_WorkGroupID
#define __syncthreads() barrier()
//memoryBarrierShared();
#define threadIdx gl_LocalInvocationID // CUDA's threadIdx is the within-block index
#define __shared__ shared
#define wA 1024
#define wB 1024
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ REAL As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ REAL Bs[BLOCK_SIZE][BLOCK_SIZE];
void main()
{
    // Block index
    uint bx = blockIdx.x;
    uint by = blockIdx.y;
    // Thread index
    uint tx = threadIdx.x;
    uint ty = threadIdx.y;
    // Index of the first sub-matrix of A processed by the block
    uint aBegin = wA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    uint aEnd = aBegin + wA - 1;
    // Step size used to iterate through the sub-matrices of A
    uint aStep = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    uint bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    uint bStep = BLOCK_SIZE * wB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    REAL Csub = 0;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    /*for (uint a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)*/
    uint a = aBegin, b = bBegin;
    for (int i = 0; i < 32; i++)
    {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        __syncthreads();
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
        a += aStep, b += bStep;
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    uint c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
Full assembly:
Compute info
------------
Internal error: assembly compile error for compute shader at offset 108543:
-- error message --
line 2094, column 6: error: BAR not allowed inside flow control blocks.
line 2108, column 6: error: BAR not allowed inside flow control blocks.
-- internal assembly text --
!!NVcp5.0
OPTION NV_shader_storage_buffer;
OPTION NV_shader_atomic_float;
GROUP_SIZE 32 32;
# cgc version 3.1.0001, build date Aug 8 2012
# command line args:
#vendor NVIDIA Corporation
#version 3.1.0.1
#profile gp5cp
#program main
#semantic As : SHARED
#semantic Bs : SHARED
#semantic oBuffer : SBO_BUFFER[2]
#semantic iBuffer : SBO_BUFFER[0]
#semantic oBuffer : SBO_BUFFER[1]
#var float Bs[0][0] : : shared_mem[8188] : -1 : 1
# lots more
#var float Bs[31][31] : : shared_mem[8188] : -1 : 1
#var uint3 gl_GlobalInvocationID : $vin.GBLID : GBLID[4] : -1 : 1
#var uint3 gl_WorkGroupID : $vin.CTAID : CTAID[2] : -1 : 1
#var float C[0] : : sbo_buffer[2][0] : -1 : 1
#var float A[0] : : sbo_buffer[0][0] : -1 : 1
#var float B[0] : : sbo_buffer[1][0] : -1 : 1
SHARED_MEMORY 8192;
SHARED shared_mem[] = { program.sharedmem };
STORAGE sbo_buf0[] = { program.storage[0] };
STORAGE sbo_buf1[] = { program.storage[1] };
STORAGE sbo_buf2[] = { program.storage[2] };
TEMP R0, R1;
MOV.F R0.y, {0, 0, 0, 0}.x;
MUL.U R0.z, invocation.groupid.y, {32768, 0, 0, 0}.x;
MUL.U R0.w, invocation.groupid.x, {32, 0, 0, 0}.x;
REP.S {32, 0, 0, 0};
MAD.U R1.x, invocation.globalid.y, {1024, 0, 0, 0}, R0.w;
MAD.U R0.x, invocation.globalid.y, {1024, 0, 0, 0}, R0.z;
ADD.U R0.x, invocation.globalid, R0;
MUL.S R1.y, R0.x, {4, 0, 0, 0}.x;
ADD.U R1.x, invocation.globalid, R1;
MUL.S R0.x, invocation.globalid.y, {128, 0, 0, 0};
MOV.U R1.z, R1.y;
MAD.S R1.y, invocation.globalid.x, {4, 0, 0, 0}.x, R0.x;
LDB.F32 R0.x, sbo_buf0[R1.z];
MOV.U R1.y, R1;
STS.F32 R0, shared_mem[R1.y];
MUL.S R1.x, R1, {4, 0, 0, 0};
MOV.U R0.x, R1;
LDB.F32 R0.x, sbo_buf1[R0.x];
STS.F32 R0, shared_mem[R1.y + 4096];
BAR ;
MOV.S R1.y, {0, 0, 0, 0}.x;
REP.S {32, 0, 0, 0};
MUL.S R1.x, R1.y, {128, 0, 0, 0};
MAD.S R1.x, invocation.globalid, {4, 0, 0, 0}, R1;
MUL.S R0.x, invocation.globalid.y, {128, 0, 0, 0};
MAD.S R0.x, R1.y, {4, 0, 0, 0}, R0;
MOV.U R1.z, R1.x;
MOV.U R1.x, R0;
LDS.F32 R0.x, shared_mem[R1.z + 4096];
LDS.F32 R1.x, shared_mem[R1.x];
MAD.F R0.y, R1.x, R0.x, R0;
ADD.S R1.y, R1, {1, 0, 0, 0}.x;
ENDREP;
BAR ;
ADD.U R0.z, R0, {32, 0, 0, 0}.x;
ADD.U R0.w, R0, {32768, 0, 0, 0}.x;
ENDREP;
MUL.U R0.x, invocation.groupid, {32, 0, 0, 0};
MAD.U R0.x, invocation.groupid.y, {32768, 0, 0, 0}, R0;
MAD.U R0.x, invocation.globalid.y, {1024, 0, 0, 0}, R0;
ADD.U R0.x, R0, invocation.globalid;
MUL.S R0.x, R0, {4, 0, 0, 0};
MOV.U R0.x, R0;
STB.F32 R0.y, sbo_buf2[R0.x];
END
# 44 instructions, 2 R-regs