NVIDIA releases OpenGL 4.3 beta drivers

Also two questions before second bug report…
1.is multi_draw_indirect fully supported in current Nvidia HW (and Fermi) without CPU support in driver or is implemented some part in software I say that because this originated from AMD extension so seems AMD really supports in HW…
2. It seems the fbo_no_attachments extension is equivalent to DX11.1 TIR (target-independent rasterization); if so, how is it implementable on NVIDIA's first-generation D3D11 HW (Fermi), when I read on some sites that TIR was the one thing current D3D11 HW doesn't support… what are the differences then?
Second sample is in 430-compute shader sample seems I get fixed by removing readonly and writeonly qualifiers from storage buffer and then fixing a bug by g-truc
ie changing

Out.Ouput[gl_GlobalInvocationID].Texcoord = In.Input[gl_GlobalInvocationID].Texcoord;

by

Out.Ouput[gl_GlobalInvocationID.x].Texcoord = In.Input[gl_GlobalInvocationID.x].Texcoord;

so seems
this should work but doesn’t work

layout(binding = BUFFER_INPUT) readonly buffer iBuffer
{
    vertex Input[];
} In;

layout(binding = BUFFER_OUTPUT) writeonly buffer oBuffer
{
    vertex Ouput[];
} Out;

i get:
Compute info

0(22) : error C0000: syntax error, unexpected identifier, expecting "::" at token "iBuffer"
0(27) : error C0000: syntax error, unexpected identifier, expecting "::" at token "oBuffer"
 and fixing it by ex:

//layout(binding = BUFFER_INPUT) readonly buffer iBuffer
layout(binding = BUFFER_INPUT)  buffer iBuffer //works!

At least for now with this fix it works!

One question is compute shader stage being exposed in Nvidia Cg language (where to ask?) similar to tesselation stages where added in Cg 3.0… also if not, really another question is if Nvidia Cg compiler cgc.exe will at least get support for that so we can compile GLSL compute shaders to NV_compute_shader assembly similar to which I get from driver when assembly errors occur (see below)…
So this is really the kind of bug in compute shader that I’m afraid Nvidia will not fix for security/hanging kind of thing:
I was happy to see I could port Nvidia matmul CUDA sample to OGL compute shader without much effort basically doing two things:
*Adding defines:

#define blockIdx gl_WorkGroupID
#define __syncthreads() barrier()
#define threadIdx gl_GlobalInvocationID
#define __shared__ shared

and changing all int vars in CUDA kernel to uint vars (another solution would have been to cast gl_WorkGroupID and gl_GlobalInvocationID to int)

2.and another one was to put out shared mem definitions out of kernel to be in global scope

__shared__ REAL As[BLOCK_SIZE][BLOCK_SIZE];

Then I was happy code to be able to be free of errors but a shame is in compilation:

Internal error: assembly compile error for compute shader at offset 108543:
-- error message --
line 2094, column 6:  error: BAR not allowed inside flow control blocks.
line 2108, column 6:  error: BAR not allowed inside flow control blocks.

First, let’s be clear: this code works in CUDA and is recommended by NVIDIA for its GPUs… second, it exploits a fine detail of GPUs in that barriers can appear in control flow, assuming all warps in a block follow the same path through the control flow…
why not in OGL compute shaders?..
also see the “flow control” is cause by this loop:

for (uint a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)

not a branch and even more is executed same iterations for every thread but it’s not trivial in the form stated to infer by compiler so I thought hey fix it i changed to (in my sample matrix w=h=1024 and block size=32 so iters=1024/32):

uint a = aBegin, b = bBegin;
    for (int i=0; i<32; i++)
    {
..loop body containing barriers

a += aStep, b += bStep
}

even with this I get the same error, so the only way to fix it is to unroll the loop manually, which is ugly to say the least… I even added #pragma unroll to the loop to force the compiler to expand it for me, but it’s not working… is there an NVIDIA way to force unrolling, like CUDA’s #pragma unroll?
Full shader:

#version 430 core
#extension GL_ARB_compute_shader : require
#extension GL_ARB_shader_storage_buffer_object : require
#extension GL_ARB_gpu_shader_fp64: require

// Scalar type for the whole kernel; flip to double to exercise fp64.
#define REAL float
//#define REAL double


#define BUFFER_A    0
#define BUFFER_B    1
#define BUFFER_C    2

#define BLOCK_SIZE 32


layout (local_size_x = BLOCK_SIZE, local_size_y = BLOCK_SIZE) in;

// Storage blocks for the three matrices (C = A * B).
// Each block needs a UNIQUE block name: the original declared both the B
// and C blocks as "oBuffer", which is an invalid redeclaration in GLSL.
layout(binding = BUFFER_A) buffer aBuffer
{
    REAL A[];
} ;

layout(binding = BUFFER_B) buffer bBuffer
{
    REAL B[];
} ;

layout(binding = BUFFER_C) buffer cBuffer
{
    REAL C[];
} ;


//matrixMul(float *C, float *A, float *B, int wA, int wB)
// CUDA-compatibility macros for the ported matrixMul kernel.
#define blockIdx gl_WorkGroupID
#define __syncthreads() barrier()
//memoryBarrierShared();
// CUDA threadIdx is the index WITHIN the block; the GLSL equivalent is
// gl_LocalInvocationID. The original mapped it to gl_GlobalInvocationID,
// which indexes the whole dispatch and overruns the BLOCK_SIZE x BLOCK_SIZE
// shared tiles for every work group other than (0,0).
#define threadIdx gl_LocalInvocationID
#define __shared__ shared

// Square 1024x1024 matrices baked in for this test.
#define wA 1024
#define wB 1024
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ REAL As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ REAL Bs[BLOCK_SIZE][BLOCK_SIZE];

void main()
{    

    // Block index
    uint bx = blockIdx.x;
    uint by = blockIdx.y;

    // Thread index (within the work group)
    uint tx = threadIdx.x;
    uint ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    uint aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    // (only used by the original loop form, kept commented below)
    uint aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    uint aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    uint bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    uint bStep  = BLOCK_SIZE * wB;

    // Csub accumulates the single element of the C tile owned by this thread
    REAL Csub = 0;

    // Loop over all the sub-matrices of A and B required to compute the
    // block sub-matrix. Rewritten with a compile-time trip count
    // (wA / BLOCK_SIZE = 32) so the compiler can see the loop is uniform.
    /*for (uint a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)*/
    uint a = aBegin, b = bBegin;
    for (int i=0; i<32; i++)
    {


        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    
        a += aStep, b += bStep;
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    uint c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

Full assembly:

Compute info
------------
Internal error: assembly compile error for compute shader at offset 108543:
-- error message --
line 2094, column 6:  error: BAR not allowed inside flow control blocks.
line 2108, column 6:  error: BAR not allowed inside flow control blocks.
-- internal assembly text --
!!NVcp5.0
OPTION NV_shader_storage_buffer;
OPTION NV_shader_atomic_float;
GROUP_SIZE 32 32;
# cgc version 3.1.0001, build date Aug  8 2012
# command line args: 
#vendor NVIDIA Corporation
#version 3.1.0.1
#profile gp5cp
#program main
#semantic As : SHARED
#semantic Bs : SHARED
#semantic oBuffer : SBO_BUFFER[2]
#semantic iBuffer : SBO_BUFFER[0]
#semantic oBuffer : SBO_BUFFER[1]
#var float Bs[0][0] :  : shared_mem[8188] : -1 : 1
# lots more
#var float Bs[31][31] :  : shared_mem[8188] : -1 : 1
#var uint3 gl_GlobalInvocationID : $vin.GBLID : GBLID[4] : -1 : 1
#var uint3 gl_WorkGroupID : $vin.CTAID : CTAID[2] : -1 : 1
#var float C[0] :  : sbo_buffer[2][0] : -1 : 1
#var float A[0] :  : sbo_buffer[0][0] : -1 : 1
#var float B[0] :  : sbo_buffer[1][0] : -1 : 1
SHARED_MEMORY 8192;
SHARED shared_mem[] = { program.sharedmem };
STORAGE sbo_buf0[] = { program.storage[0] };
STORAGE sbo_buf1[] = { program.storage[1] };
STORAGE sbo_buf2[] = { program.storage[2] };
TEMP R0, R1;
MOV.F R0.y, {0, 0, 0, 0}.x;
MUL.U R0.z, invocation.groupid.y, {32768, 0, 0, 0}.x;
MUL.U R0.w, invocation.groupid.x, {32, 0, 0, 0}.x;
REP.S {32, 0, 0, 0};
MAD.U R1.x, invocation.globalid.y, {1024, 0, 0, 0}, R0.w;
MAD.U R0.x, invocation.globalid.y, {1024, 0, 0, 0}, R0.z;
ADD.U R0.x, invocation.globalid, R0;
MUL.S R1.y, R0.x, {4, 0, 0, 0}.x;
ADD.U R1.x, invocation.globalid, R1;
MUL.S R0.x, invocation.globalid.y, {128, 0, 0, 0};
MOV.U R1.z, R1.y;
MAD.S R1.y, invocation.globalid.x, {4, 0, 0, 0}.x, R0.x;
LDB.F32 R0.x, sbo_buf0[R1.z];
MOV.U R1.y, R1;
STS.F32 R0, shared_mem[R1.y];
MUL.S R1.x, R1, {4, 0, 0, 0};
MOV.U R0.x, R1;
LDB.F32 R0.x, sbo_buf1[R0.x];
STS.F32 R0, shared_mem[R1.y + 4096];
BAR  ;
MOV.S R1.y, {0, 0, 0, 0}.x;
REP.S {32, 0, 0, 0};
MUL.S R1.x, R1.y, {128, 0, 0, 0};
MAD.S R1.x, invocation.globalid, {4, 0, 0, 0}, R1;
MUL.S R0.x, invocation.globalid.y, {128, 0, 0, 0};
MAD.S R0.x, R1.y, {4, 0, 0, 0}, R0;
MOV.U R1.z, R1.x;
MOV.U R1.x, R0;
LDS.F32 R0.x, shared_mem[R1.z + 4096];
LDS.F32 R1.x, shared_mem[R1.x];
MAD.F R0.y, R1.x, R0.x, R0;
ADD.S R1.y, R1, {1, 0, 0, 0}.x;
ENDREP;
BAR  ;
ADD.U R0.z, R0, {32, 0, 0, 0}.x;
ADD.U R0.w, R0, {32768, 0, 0, 0}.x;
ENDREP;
MUL.U R0.x, invocation.groupid, {32, 0, 0, 0};
MAD.U R0.x, invocation.groupid.y, {32768, 0, 0, 0}, R0;
MAD.U R0.x, invocation.globalid.y, {1024, 0, 0, 0}, R0;
ADD.U R0.x, R0, invocation.globalid;
MUL.S R0.x, R0, {4, 0, 0, 0};
MOV.U R0.x, R0;
STB.F32 R0.y, sbo_buf2[R0.x];
END
# 44 instructions, 2 R-regs

From post below I want to ask some questions/suggestions:
*Would be good similar to Nvidia exposing NV_CG_shader extension for allowing “some” Cg functions (and syntax?) in GLSL code if exposed some call NV_CUDA_shader extension exposing to relaxation things in compute shaders to allow easier porting of CUDA kernels to GLSL (I’m smoking crack?):
first defines for all CUDA grid, threadid syntax and the likes having GLSL equivalents like:
#define blockIdx gl_WorkGroupID
#define __syncthreads() barrier()
#define threadIdx gl_GlobalInvocationID
#define shared shared
and defining like int and not uint like intrinsics in GLSL for mentions mentioned in previous post for portability questions
and second allow shared definitions inside compute shader functions which compiler will put out internally if needed…
with that these matrix mul code could be ported immediately!..
*The specification doesn’t mention any restriction on the usage of barriers inside control flow, nor restrictions like CUDA’s requirement that all threads follow the same path to reach the barrier… is this going to be fixed in the spec, or is it IHV-dependent…
if some restrictions apply in spec Nvidia could allow similar relaxation as done in CUDA by an extensions GL_NV_barriers_in_control_flow or similar to inform of that support…
*Also sorry but what is Directcompute restrictions in that case? there are any?
*Finally allow to programatically exposing work group size like by requiring local_size_x variables not to be constant like (new extension like GL_NV_uniform_local_size)

uniform int local_size_x_from_app;
layout (local_size_x = local_size_x_from_app) in;

or better new function
DispatchComputeSetLocalSize(globx,globy,globz,locx,locy,locz)
equivalent of CUDA support
kernelname<<globalsize,localsize>>(…)
I know that DirectCompute doesn’t support exposing local group size but CUDA yes…

Please answer questions!!

In the future, please use [ code ] blocks to format pieces of code in your posts.

I have fixed all using

.. thanks alfonse..

Another bad news altough this would be Nvidia only anyway for now: pointers on compute shaders not working on Nvidia..

I have tried to answer another question towards easy portability of GPGPU codes in GLSL compute shader:
using pointers in compute programs! of course using GL_NV_shader_buffer_load..
if I not enable parser correctly points:

0(47) : error C7531: pointers require “#extension GL_NV_shader_buffer_load : enable” before use


but if I enable I get:

0(13) : warning C7547: extension GL_NV_shader_buffer_load not supported in profile gp5cp


but GL_NV_shader_buffer_load is not supported in every stage? why not in compute?!:mad:
so Nvidia this is a limitation of implementation for now or forever (well more or less :)).. anyway this is supported on CUDA so surely supported in GPU HW for ages..

Hi oscar,
ran into the barrier problem during development as well, currently it seems like an oversight in the spec itself, if you look into 4.3 GLSL :

The barrier() function may only be placed inside the function main() of the tessellation control shader
and may not be called within any control flow. Barriers are also disallowed after a return statement in the
function main(). Any such misplaced barriers result in a compile-time error.

This hasn’t been specced in detail for compute yet. Ideally spec and implementation can be relaxed accordingly!

In general things are in an early stage yet, it will take a couple revisions until the implementation has all the features it should have.

Regarding your twitter on NV_shader_buffer_load, a quick compile test using

#extension GL_NV_gpu_shader5 : enable

did not give any errors when declaring some pointers. Although use of pointers on shared memory currently won’t be possible

Hi oscarbg

Thanks for the bug reports. We have looked at the g-truc OpenGL 4.3 samples and have a few driver fixes pending. There are also a few bugs in the OpenGL 4.3 samples, which I have reported to g-truc.

  1. The issue with calling imageSize() on an image declared with “coherent” has been fixed in the driver. The beta will be updated soon with this fix. Until then you can workaround the issue by removing “coherent” from the declaration.

  2. Yes, multi_draw_indirect is fully hardware accelerated on Fermi and above GPUs.

  3. fbo_no_attachments just allows an FBO to be “complete” without any attachments. I don’t think it’s equivalent to DX11 TIR.

  4. The “readonly” and “writeonly” memory qualifiers on buffer declarations is broken in the driver. We’re working on a fix for this. For now just remove these qualifiers.

  5. Some of your later questions might be better served in the “Official feedback on OpenGL 4.3 thread”.

Thanks for your feedback.

cheers
Piers

Sorry I’m behind on looking at these forum updates.

The error with #extension GL_NV_shader_buffer_load is a compiler bug that I was able to reproduce. The fix was an oversight in the logic handling “what #extension” directives are supported in which shader types/architectures. I’ll work on a fix for the next beta driver update.

As Christoph Kubisch mentions, one workaround for this is to enable the extension GL_NV_gpu_shader5, which also includes the ability to store to pointers. GL_NV_shader_buffer_load only allows pointer reads.

Many thanks for responses Christoph and Pat!
some updates on my findings:
First I think that comment about not currently possible to use codes that have barriers with control flow is correctly to be posted here because if not allowed in spec it’s here the place to motivate Nvidia to relax this restriction…
Reasons to allow it are big first and foremost:
1.D3D11 compute shader allows it!.. sorry to be pedantic but OGL ARB can’t claim OGL 4.3 is a superset of D3D11…
for example I have found in D3D SDK ComputeShaderSort11 sample in kernel BitonicSort
this code

for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
    {
        unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? shared_data[GI ^ j] : shared_data[GI];
        GroupMemoryBarrierWithGroupSync();
        shared_data[GI] = result;
        GroupMemoryBarrierWithGroupSync();
    }

well this code can’t be without control flow which I verified getting compiled DX IL assembly by fxc which has barriers inside “loop” in DX IL parlance keyworks… even more I compiled down to AMD 5850 ISA using AMD GPU shader analyzer and yes barriers is there between control flow…
2. Even if D3D11 didn’t allow it, lots of basic GPGPU algorithms are fully optimized under these conditions — sorts, perhaps even scans, and many more; I think searching CUDA code over the web will show the importance of this support in CUDA…
3. I have found that AMD is going to allow it! well details follow…
by using some leaked AMD drivers (9.00) and headers about it I found about AMD_compute_shader in fglrx drivers and seeing inside driver binary searching which keywords are implemented from AMD_compute_shader and which ones from ARB_compute_shader I have compiled successfully a compute shader having barrier inside control flow impossible to remove (the control flow) from the compiler… I can post full code and binary to check if you dont’ believe…

*Allow selective unroll of loops similar to D3D [unroll] (i.e. better control over #pragma optionNV(unroll all))
Motivation:
Another question is well matmul code after all has the loop with fixed iters (assuming tuned for a specific matrix dimensions) so we can unroll I have found using #pragma optionNV(unroll all) solves the problem and compiles the kernel BUT produces big binary so question there is a pragma in NV compiler to allow unrolling of selected loops similar to D3D [unroll]… hey that’s another place D3D11 CS are better than OpenGL… with that support I could unroll outer loop of matmul code posted before and the loop between barriers don’t be unrolled…

I have found using GL_NV_gpu_shader5 works perfectly… and allows using without less massage more cuda codes… I found happily that using OpenCL codes (CUDA code also use it) that pass shared mem pointers as function args compile correctly up to assembly compile that founds
something like a LDS.? line and returns with error…
So I have hope with your comment “Although use of pointers on shared memory currently won’t be possible” that this will be possible in the future… Please allow that use case!

First I think that comment about not currently possible to use codes that have barriers with control flow is correctly to be posted here because if not allowed in spec it’s here the place to motivate Nvidia to relax this restriction

Presumably by releasing an extension, right? Because we don’t want anyone, AMD, NVIDIA, Intel, etc just ignoring what the spec says because that’s not what it is that they want. Like your AMD_compute_shader example is a separate specification, thus allowing AMD to expose whatever functionality they choose.

Alfonse wouldn’t be better to say hey it’s a bug in spec definition after all is some days old and only implementd by Nvidia right now… and we have two options:

  1. the spec doesn’t prohibit barriers in control flow; it’s implementation-dependent…
  2. spec supports barriers inside control flow after all all D3D11 HW and even OpenCL HW allow it…

NVIDIA doesn’t control the OpenGL specification. This would be a proposal for the ARB, not for NVIDIA.

“barrier();” isn’t allowed in control flow but the new barriers like “groupMemoryBarrier();” (which would be the equivalent right?) are allowed I think and i successfully used those with the Nvidia Driver (see line 269 here https://github.com/progschj/OpenGL-Examples/blob/master/experimental/XXcompute_shader_nbody.cpp). Actually the way I read it ogl even has global barriers (?) which aren’t even present in for example opencl.

Edit: while I’m at it. One i think obvious bug with the 4.3 driver I found was that according to spec “gl_WorkGroupSize” is supposed to be const so it can be used as array size. In 7.1 of the spec it says:

The built-in constant gl_WorkGroupSize is a compute-shader constant containing the local work-group
size of the shader. The size of the work group in the X, Y, and Z dimensions is stored in the x, y, and z
components. The values stored in gl_WorkGroupSize match those specified in the required local_size_x,
local_size_y, and local_size_z layout qualifiers for the current shader. This value is constant so that it can
be used to size arrays of memory that can be shared within the local work group.

but when I try exactly that: “shared vec4 tmp[gl_WorkGroupSize.x];” (for example on line 258 of the example linked above)
it errors with: “0(6) : error C1307: non constant expression for array size”

Also shared memory seems somehow broken I have this example: aI4WL - Online C++ Compiler & Debugging Tool - Ideone.com
Essentially it tries to first fill a shared array with the numbers of 0-256 and then write those out to memory. But it appears that line 14 writes to a “wrong offset” partially overwriting the first half of the local array.

Hmm, the forum ate my first post. Lets try again:

About the barriers: “barrier();” does indeed not work in flow control which is in accordance with the specs. But the other barriers such as groupMemoryBarrier etc. do. See for example line 268 here: https://github.com/progschj/OpenGL-Examples/blob/master/experimental/XXcompute_shader_nbody.cpp

There are two issues I found which I think are bugs in the driver.

The first one being that the values in gl_WorkGroupSize are not constant it seems.
Trying something like “shared float tmp[gl_WorkGroupSize.x];”
results in a shader compiler error: “error C1307: non constant expression for array size”
But the spec explicitly states that WorkGroupSize is constant for exactly that purpose (in 7.1, page 112 of the not annotated version):

The built-in constant gl_WorkGroupSize is a compute-shader constant containing the local work-group
size of the shader. The size of the work group in the X, Y, and Z dimensions is stored in the x, y, and z
components. The values stored in gl_WorkGroupSize match those specified in the required local_size_x,
local_size_y, and local_size_z layout qualifiers for the current shader. This value is constant so that it can
be used to size arrays of memory that can be shared within the local work group
.

The other is that indexing/writing to shared memory seems “off”. I broke down my example to the following shader:


        // Driver-bug repro: a single 128-wide work group fills the shared
        // array "local[256]" (each thread writes slot i and slot i+128),
        // then copies all 256 values out through an imageBuffer.
        // Expected buffer contents after the dispatch: 0..255.
        #version 430
        layout(local_size_x=128) in;
        
        layout(r32f, location = 0) uniform imageBuffer data;
        
        // Two 128-element halves of shared memory.
        shared float local[256];
        void main() {
           int N = imageSize(data);                        // buffer length (unused)
           int index = int(gl_GlobalInvocationID);         // first component of the global ID
           int localindex = int(gl_LocalInvocationIndex);  // 0..127 within the group

           local[localindex] = localindex;                 // first half: value == index
           local[localindex+128] = localindex+128;         // second half: 128..255

           // NOTE(review): groupMemoryBarrier() only makes memory writes
           // visible; a barrier() would also be needed if threads read each
           // other's slots. Here each thread reads back only its own writes.
           groupMemoryBarrier();
           imageStore(data, index, vec4(local[localindex]));
           imageStore(data, index+128, vec4(local[localindex+128]));
        }

which I run a single group of. The expected result is that the imageBuffer gets filled with the values 0…255. But for some reason the result is this:


    0    1    2    3    4    5    6    7  128  129  130  131  132  133  134  135
  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150  151
  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181  182  183
  184  185  186  187  188  189  190  191  192  193  194  195  196  197  198  199
  200  201  202  203  204  205  206  207  208  209  210  211  212  213  214  215
  216  217  218  219  220  221  222  223  224  225  226  227  228  229  230  231
  232  233  234  235  236  237  238  239  240  241  242  243  244  245  246  247
  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143
  144  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159
  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175
  176  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191
  192  193  194  195  196  197  198  199  200  201  202  203  204  205  206  207
  208  209  210  211  212  213  214  215  216  217  218  219  220  221  222  223
  224  225  226  227  228  229  230  231  232  233  234  235  236  237  238  239
  240  241  242  243  244  245  246  247  248  249  250  251  252  253  254  255

so the first half gets partially overwritten with the second somehow. But the overwriting starts at an offset of 8 elements…
And if I change appearances of “local[localindex+128]” to “local[localindex+2*128]” it starts at 16 elements, etc., so it’s like the +128 somehow gets added in with a wrong factor?

Here is the full test case:


#include <GL3/gl3w.h>
#include <GL/glfw.h>

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <string>
#include <vector>
#include <cstdlib>
#include <cmath>

bool running;

// window close callback function
int closedWindow()
{
    // Flag the main loop to stop; returning GL_TRUE lets GLFW
    // proceed with actually closing the window.
    running = false;
    return GL_TRUE;
}

// Helper: report shader compile errors.
// Prints the shader's info log to stderr when compilation failed.
// Returns true iff the shader compiled successfully.
bool check_shader_compile_status(GLuint obj)
{
    GLint status;
    glGetShaderiv(obj, GL_COMPILE_STATUS, &status);
    if(status == GL_FALSE)
    {
        GLint length;
        glGetShaderiv(obj, GL_INFO_LOG_LENGTH, &length);
        // GL_INFO_LOG_LENGTH may legitimately be 0 (no log); indexing an
        // empty vector with log[0] would be undefined behavior.
        if(length > 0)
        {
            std::vector<char> log(length);
            glGetShaderInfoLog(obj, length, &length, &log[0]);
            std::cerr << &log[0];
        }
        return false;
    }
    return true;
}

// Helper: report program link errors.
// Prints the program's info log to stderr when linking failed.
// Returns true iff the program linked successfully.
bool check_program_link_status(GLuint obj)
{
    GLint status;
    glGetProgramiv(obj, GL_LINK_STATUS, &status);
    if(status == GL_FALSE)
    {
        GLint length;
        glGetProgramiv(obj, GL_INFO_LOG_LENGTH, &length);
        // Guard against a zero-length log: log[0] on an empty vector is UB.
        if(length > 0)
        {
            std::vector<char> log(length);
            glGetProgramInfoLog(obj, length, &length, &log[0]);
            std::cerr << &log[0];
        }
        return false;
    }
    return true;
}

int main()
{
    int width = 640;
    int height = 480;
    
    if(glfwInit() == GL_FALSE)
    {
        std::cerr << "failed to init GLFW" << std::endl;
        return 1;
    } 
    glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 4);
    glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 3);
 
    // create a window
    if(glfwOpenWindow(width, height, 0, 0, 0, 8, 24, 8, GLFW_WINDOW) == GL_FALSE)
    {
        std::cerr << "failed to open window" << std::endl;
        glfwTerminate();
        return 1;
    }
    
    // setup windows close callback
    glfwSetWindowCloseCallback(closedWindow);
    
    glfwSwapInterval(0);
    
    if (gl3wInit())
    {
        std::cerr << "failed to init GL3W" << std::endl;
        glfwCloseWindow();
        glfwTerminate();
        return 1;
    }
    const char *source;
    int length;
        
    std::string test_source =
        "#version 430
"
        "layout(local_size_x=128) in;
"
        
        "layout(r32f, location = 0) uniform imageBuffer data;
"
        
        "shared float local[256];
"
        "void main() {
"
        "   int N = imageSize(data);
"
        "   int index = int(gl_GlobalInvocationID);
"
        "   int localindex = int(gl_LocalInvocationIndex);
"

        "   local[localindex] = localindex;
"
        "   local[localindex+128] = localindex+128;
"

        "   groupMemoryBarrier();
"
        "   imageStore(data, index, vec4(local[localindex]));
"
        "   imageStore(data, index+128, vec4(local[localindex+128]));
"
        "}
";
   
    // program and shader handles
    GLuint test_program, test_shader;

    // create and compiler vertex shader
    test_shader = glCreateShader(GL_COMPUTE_SHADER);
    source = test_source.c_str();
    length = test_source.size();
    glShaderSource(test_shader, 1, &source, &length); 
    glCompileShader(test_shader);
    if(!check_shader_compile_status(test_shader))
    {
        return 1;
    }

    // create program
    test_program = glCreateProgram();
    
    // attach shaders
    glAttachShader(test_program, test_shader);
   
    // link the program and check for errors
    glLinkProgram(test_program);
    check_program_link_status(test_program);   


    std::vector<float> data(256);
    //~ std::generate(data.begin(), data.end(), randf);
    std::fill(data.begin(), data.end(), 1.0f);
    
    for(int i = 0;i<256;++i)
    {
        data[i] = -1;
    }
    
    GLuint buffer;
    
    glGenBuffers(1, &buffer);
    glBindBuffer(GL_TEXTURE_BUFFER, buffer);
    glBufferData(GL_TEXTURE_BUFFER, sizeof(float)*data.size(),  &data[0], GL_STATIC_DRAW);                  


    // texture handle
    GLuint buffer_texture;
    
    glGenTextures(1, &buffer_texture);
    glBindTexture(GL_TEXTURE_BUFFER, buffer_texture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer);

    // bind images
    glBindImageTexture(0, buffer_texture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F);
    
    glUseProgram(test_program);
    glUniform1i(0, 0);

    glDispatchCompute(1, 1, 1);
    
    glGetBufferSubData(GL_TEXTURE_BUFFER, 0, sizeof(float)*data.size(), &data[0]);
    
    for(size_t i = 0;i<data.size();i+=1)
    {
        if(i%16==0) std::cout << std::endl;
        std::cout << std::setw(5) << data[i];
    }
    std::cout << std::endl;
    
    GLint shared_size;
    glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &shared_size);
    std::cout << "max shared: " << shared_size << std::endl;
    
    glfwCloseWindow();
    glfwTerminate();
    return 0;
}

Edit:
I did some more research into the second issue and wondered whether, instead of writing “localindex+128”, I could precompute a localindex128 and use that, since that would get rid of the inlined +128 that I suspected was being added incorrectly. At first that didn’t change anything, which I then realized doesn’t mean much, since the compiler might inline my localindex128 during static analysis. So I needed a way to write “+128” without the compiler going nuts with optimization. First I tried making localindex128 volatile, which didn’t work. Then I thought “well, all I need is a constant 128 the compiler thinks isn’t constant”… See where this is going? :smiley: So I used gl_WorkGroupSize.x from the first issue and ended up with this:


        // Workaround variant of the repro: precompute localindex128 using
        // gl_WorkGroupSize.x so the "+128" offset is not a literal constant
        // the compiler can fold — with this form the output is the expected
        // 0..255, isolating the bad inlined "+128" addressing.
        #version 430
        layout(local_size_x=128) in;
        
        layout(r32f, location = 0) uniform imageBuffer data;
        
        // Two 128-element halves of shared memory.
        shared float local[256];
        void main() {
           int N = imageSize(data);                        // buffer length (unused)
           int index = int(gl_GlobalInvocationID);         // first component of the global ID
           int localindex = int(gl_LocalInvocationIndex);  // 0..127 within the group
           int localindex128 = int(gl_LocalInvocationIndex+gl_WorkGroupSize.x); //gl_WorkGroupSize.x is 128 but because of the first bug the compiler doesn't know it's constant

           local[localindex] = localindex;
           local[localindex128] = localindex128;

           // Each thread reads back only its own writes, so only the
           // memory flush is needed here (no execution barrier).
           groupMemoryBarrier();
           imageStore(data, index, vec4(local[localindex]));
           imageStore(data, index+128, vec4(local[localindex128]));
        }

which gives the expected result. I guess that is about as far as I can narrow it down from my end.

Jakob,

Thanks for the feedback.

I agree that the first issue (not treating gl_WorkGroupSize as a constant expression) looks like a driver compiler bug, where it is treating it as an “in” instead of a “const”. Hopefully, this should be easy to fix.

I’ll have to look at the second issue in more detail. One thing that looks wrong about that shader is that there is no barrier() call between the shared memory stores and loads. There are two types of barriers with different purposes:

  • groupMemoryBarrier() ensures that memory transactions are flushed so other threads can see them
  • barrier() ensure that all threads have finished their stores before we continue

Typically, you need both for safety. It’s not clear to me that the lack of a barrier() call in your shader has anything to do with the problem here, because each thread appears to read only shared memory values written by its own thread (and not touched by any other thread).

Yes, that’s correct, though it would certainly be possible to create a NVIDIA extension to GLSL that relaxes this restriction.

The limitation on barrier() is inherited from tessellation control shaders. The intent of the restriction is to prevent you from writing shaders that will hang. For example, in the following code:

if (divergent_conditional_expression) {
  barrier();
}

the threads where the expression is true will call barrier() and then wait around for the other threads. But the other threads might not call barrier() at all.

This restriction wasn’t a big deal for tessellation control shaders, as the places where you want barrier() calls are typically very limited and in well-defined places (i.e., before computing outer or inner tessellation levels). This is a bigger issue for compute shaders because there are algorithms where you may want to run multiple phases of computation in a loop, which would naturally result in barrier() calls inside the loop.

There’s a few options here, none of which are perfect:

(1) Just allow barrier() anywhere, making hangs possible.

(2) Allow barrier() in more places, but still have some limitations to avoid hangs. For example, allow it in loops with uniform flow control (e.g., uniform start/end points, no conditional “break”, no conditional “continue” before the barrier). This will be fairly tricky to specify and implement.

(3) Leave it as-is.

I’ve already filed a Khronos bug report on this issue, but it was too late for GLSL 4.30.

Jakob,

[QUOTE=pbrown;1241867]
I’ll have to look at the second issue in more detail. One thing that looks wrong about that shader is that there is no barrier() call between the shared memory stores and loads. [/QUOTE]

I think I’ve root-caused it. It should be easy to fix.

It has nothing to do with the absence of barrier() in your shader, though you will need barrier() calls for more complex shared memory usage as I noted in my previous comment.

Yep, I just reread that section of the specs and also realized that this means my tiled nbody kernel I ported from CUDA is incorrect (and can’t be fixed, since it would need the barrier inside flow control… :frowning: ). Thanks for looking into these.

Hi Pat,
Is it too much to ask for NV to remove this check (barriers in control flow) in assembly compute shader code so we can start playing with advanced compute codes that require it?
can we expect some solution before GL 4.3 gets into mainline drivers i.e. in next 4.3 beta drivers?

Really I tried also to patch binary shaders but binaries doesn’t seem similar to CUDA binaries…

hope NV implements either solution #1 or #2
as said seems #1 is what D3D Compute and CUDA and OpenCL allow… i.e. no restrictions and programmers are responsible for “good” code without hangs…
if you want to avoid hangs realistically solution #2 can be enough for just algorithms that require control flow…
I’m waiting for Nvidia to lift that restriction soon either way for broad testing of some compute codes…

Thanks for detailed info on the issue!

[QUOTE=pbrown;1241869]Yes, that’s correct, though it would certainly be possible to create a NVIDIA extension to GLSL that relaxes this restriction.

The limitation on barrier() is inherited from tessellation control shaders. The intent of the restriction is to prevent you from writing shaders that will hang. For example, in the following code:

if (divergent_conditional_expression) {
  barrier();
}

the threads where the expression is true will call barrier() and then wait around for the other threads. But the other threads might not call barrier() at all.

This restriction wasn’t a big deal for tessellation control shaders, as the places where you want barrier() calls are typically very limited and in well-defined places (i.e., before computing outer or inner tessellation levels). This is a bigger issue for compute shaders because there are algorithms where you may want to run multiple phases of computation in a loop, which would naturally result in barrier() calls inside the loop.

There’s a few options here, none of which are perfect:

(1) Just allow barrier() anywhere, making hangs possible.

(2) Allow barrier() in more places, but still have some limitations to avoid hangs. For example, allow it in loops with uniform flow control (e.g., uniform start/end points, no conditional “break”, no conditional “continue” before the barrier). This will be fairly tricky to specify and implement.

(3) Leave it as-is.

I’ve already filed a Khronos bug report on this issue, but it was too late for GLSL 4.30.[/QUOTE]

Think I’ve run into a bug in the Linux driver. I reduced a misbehaving parallel scan to this minimal test case:


// ARB_compute_shader shared[] array test case.
//
// When using arrays of shared variables, index expressions involving
// gl_LocalInvocationID lead to stores not being observed by subsequent
// loads to the same location.

#include <GL/glew.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __linux__
#  include <GL/glx.h>
#  include <X11/Xlib.h>
#endif
#ifdef _WIN32
#  include <Windows.h>
#endif

// Evaluate GL call X, then report any pending GL error tagged with this
// call site's source line number (aborts via CheckGLError on error).
#define mChkGL(X) X; CheckGLError(__LINE__);

const GLchar *ComputeCode =
  // This shader should write 1 to the second element of sharedBuf,
  // then read that value back and write it to OutputBuffer.
  // There is 1 work group of size 1. The code is guarded so that
  // only gl_LocationInvocationID.x == 0 executes the test.
  "#version 430 core                                                   
"
  "                                                                    
"
  "layout(std430, binding = 0) buffer Output { int OutputBuffer[1]; }; 
"
  "shared int sharedBuf[2];                                            
"
  "                                                                    
"
  "layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;    
"
  "void main() {                                                       
"
  "  if(gl_LocalInvocationID.x == 0) {                                 
"
  "    /* Initialize second element with 0. */                         
"
  "    sharedBuf[1] = 0;                                               
"
  "                                                                    
"
// Uncomment the following line to make validation pass.
//  "    sharedBuf[1] = 1;                                               
"
  "                                                                    
"
  "    /* This store to the second element is not seen by the load. */ 
"
  "    sharedBuf[1 + gl_LocalInvocationID.x] = 1;                      
"
  "                                                                    
"
  "    /* Copy second element out for validation. */                   
"
  "    OutputBuffer[0] = sharedBuf[1];                                 
"
  "  }                                                                 
"
  "}                                                                   
"
;

void CheckGLError(int line) {
  GLint glErr = glGetError();

  if(glErr != GL_NO_ERROR) {
    printf("OpenGL error %d at line %d
", glErr, line);
    exit(1);
  }
}

int main() {
  // Minimal OpenGL context setup.
#ifdef __linux__
  Display *display = XOpenDisplay(NULL);
  Window window = XCreateSimpleWindow(display, DefaultRootWindow(display), 0, 0, 1, 1, 0, 0, 0);
  int visual[] = { GLX_RGBA, 0 };
  XVisualInfo *vInfo = glXChooseVisual(display, DefaultScreen(display), visual);
  GLXContext glCtx = glXCreateContext(display, vInfo, NULL, 1);
  glXMakeCurrent(display, window, glCtx);
#endif
#ifdef _WIN32
  HWND window = CreateWindow(L"edit", 0, 0, 0, 0, 1, 1, NULL, NULL, NULL, NULL);
  HDC dc = GetDC(window);
  PIXELFORMATDESCRIPTOR pfd;
  memset(&pfd, 0, sizeof(pfd));
  pfd.dwFlags = PFD_SUPPORT_OPENGL;
  SetPixelFormat(dc, ChoosePixelFormat(dc, &pfd), &pfd);
  HGLRC glCtx = wglCreateContext(dc);
  wglMakeCurrent(dc, glCtx);
#endif

  glewInit();

  // Allocate buffer for both programs.
  GLuint buffer;
  mChkGL(glGenBuffers(1, &buffer));
  mChkGL(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer));
  mChkGL(glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(int), NULL, GL_DYNAMIC_DRAW));

  // Build compute shader.
  GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
  mChkGL(glShaderSource(shader, 1, &ComputeCode, NULL));
  mChkGL(glCompileShader(shader));

  GLint compileLogSize;
  mChkGL(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &compileLogSize));

  if(compileLogSize > 0) {
    char *compileLog = new char[compileLogSize];
    mChkGL(glGetShaderInfoLog(shader, compileLogSize, NULL, compileLog));
    printf("%s", compileLog);
  }

  // Build compute program.
  GLuint program = glCreateProgram();
  mChkGL(glAttachShader(program, shader));
  mChkGL(glLinkProgram(program));

  GLint linkLogSize;
  mChkGL(glGetProgramiv(program, GL_INFO_LOG_LENGTH, &linkLogSize));

  if(linkLogSize > 0) {
    char *linkLog = new char[linkLogSize];
    mChkGL(glGetProgramInfoLog(program, linkLogSize, NULL, linkLog));
    printf("%s", linkLog);
  }

  // Invoke compute program and check result.
  mChkGL(glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_INT, NULL));
  mChkGL(glUseProgram(program));
  mChkGL(glDispatchCompute(1, 1, 1));
  mChkGL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT));

  int *bufferData = (int *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
  int bufferVal = *bufferData;
  mChkGL(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));

  printf("Validation %s
", (bufferVal == 1) ? "PASSED" : "FAILED");
}

The shader store to sharedBuf[1 + gl_LocalInvocationID.x] (where local id is 0) is not observed by the subsequent load from sharedBuf[1].

This is running on a GTX 470. The test passes fine with the Windows driver.