Memory synchronisation when iterating compute shaders

The code attached to this post is an abstracted form of a method I want to use to run intersection tests for a ray tracer. Due to memory limitations I can only test small groups of rays at a time, so I am trying to iteratively upload sub-arrays of the data to SSBOs, pass them through the shaders, and then compile the results into the full arrays at the end. I have confirmed that the data sizes all line up and that no errors occur, but every loop iteration other than the first results in just 0s in the “intCheck” vector I am using as a placeholder for the relevant outputs. I suspect the issue has something to do with memory synchronisation in subsequent iterations, but simple fixes like calling glMemoryBarrier() or using a fence haven’t worked, and I’m not clear on where to go from there. If anyone can see the fix to this, that would be a huge help :slight_smile:

C++:

void model::rayTrace(const char* triCheckFile, const char* closestFile, vector<glm::vec4>* origins, vector<glm::vec4>* directions, int numBounces, glm::vec3 rgb) {

    numRays = origins->size();

    vecCheck.resize(numRays * totalInd / 3.0);
    intCheck.resize(numRays * totalInd / 3.0);

    for (int i = 0; i < numRays * totalInd / 3.0; i++) {
        vecCheck[i] = glm::vec4(i % 12);
        intCheck[i] = 0.0;
    }

    int raysPerCall = min(numRays, int(availableChecks / (totalInd / 3.0)));
    int numCalls = numRays / int(raysPerCall) + 1;

    triCheckShader.setUp(triCheckFile);
    triCheckShader.Activate();

    glGenBuffers(1, &vecCheckB);
    glGenBuffers(1, &intCheckB);

    int raysDone = 0; // numRays in this call
    int isLast; // check if final call
    for (int i = 0; i < numCalls; i++) {
        int rayPack = 0;
        int diff = 0;
        if (i != numCalls - 1) {
            rayPack = raysPerCall;
            isLast = 0;
            cout << "1 - rayPack: " << rayPack << endl;
        }
        else {
            rayPack = numRays - raysDone;
            diff = raysPerCall - rayPack;
            isLast = 1;
            cout << "2 - rayPack: " << rayPack << endl;
        }

        // declare subVectors for relevant ranges of vecCheck and intCheck
        int startRay = i * raysPerCall;
        cout << "startRay:      " << startRay << endl;
        int endRay = (i + 1) * raysPerCall - isLast * diff;
        cout << "endRay:        " << endRay << endl;
        vector<glm::vec4> subVecCheck(vecCheck.begin() + startRay * (totalInd / 3.0), vecCheck.begin() + endRay * (totalInd / 3.0));
        vector<float> subIntCheck(intCheck.begin() + startRay * (totalInd / 3.0), intCheck.begin() + endRay * (totalInd / 3.0));

        glBindBuffer(GL_SHADER_STORAGE_BUFFER, vecCheckB);
        glBufferData(GL_SHADER_STORAGE_BUFFER, rayPack * (totalInd / 3.0) * sizeof(glm::vec4), subVecCheck.data(), GL_DYNAMIC_COPY);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 13, vecCheckB);

        glBindBuffer(GL_SHADER_STORAGE_BUFFER, intCheckB);
        glBufferData(GL_SHADER_STORAGE_BUFFER, rayPack * (totalInd / 3.0) * sizeof(float), subIntCheck.data(), GL_DYNAMIC_COPY);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 14, intCheckB);

        // dispatchSize.x = raysPerCall  |  dispatchSize.y = totalInd / 3.0
        glUniform1i(glGetUniformLocation(triCheckShader.ID, "numRays"), rayPack);
        glUniform1i(glGetUniformLocation(triCheckShader.ID, "faces"), totalInd / 3.0);
        glDispatchCompute(rayPack, totalInd / 3, 1);
        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

        glBindBuffer(GL_SHADER_STORAGE_BUFFER, vecCheckB);
        glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (rayPack* totalInd / 3) * sizeof(glm::vec4), subVecCheck.data());
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, intCheckB);
        glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (rayPack* totalInd / 3) * sizeof(float), subIntCheck.data());

        for (int j = 0; j < rayPack * totalInd / 3.0; j++) {
            intCheck[raysDone + j] = subIntCheck[j];
        }

        raysDone += rayPack;
    }

    for (int i = 0; i < numRays * totalInd / 3.0; i++) {
        cout << i << ": " << intCheck[i] << endl;
    }
}

Compute Shader:

#version 460 core

// Placeholder ray/triangle check kernel.
// The host dispatches one 1x1x1 work group per (ray, face) pair:
//   gl_WorkGroupID.x -> ray index within the current batch
//   gl_WorkGroupID.y -> face index
// For each pair the x, y and z components of the matching vecOut entry are
// summed and written to intOut at the same flat index.

uniform int numRays; // rays in the current batch (set by the host per dispatch)
uniform int faces;   // number of faces each ray is tested against

layout(std430, binding = 13) buffer vecCheck {
    vec4 vecOut[];
};
layout(std430, binding = 14) buffer intCheck {
    float intOut[];
};

layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

void main() {
    int rayIndex = int(gl_WorkGroupID.x);
    int faceIndex = int(gl_WorkGroupID.y);

    // Guard against a dispatch larger than the uploaded batch so no
    // invocation indexes past the end of the buffers.
    if (rayIndex >= numRays || faceIndex >= faces) {
        return;
    }

    // Row-major flattening: all faces of ray 0, then all faces of ray 1, ...
    int globalIndex = rayIndex * faces + faceIndex;

    vec4 v = vecOut[globalIndex];
    intOut[globalIndex] = v.x + v.y + v.z;

    // No barrier() here: it only synchronises invocations inside one work
    // group (a single invocation with this local size) and does not make the
    // writes visible to the host; that is the job of glMemoryBarrier on the
    // API side.
}