Not all invocations running

I am attempting to implement a simple compute operation of multiplying all values in a buffer by a push constant.

This shader:

#version 450
// One workgroup covers 1024 consecutive buffer elements along x.
layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;

// Storage buffer of floats that is scaled in place.
layout(binding = 0) buffer Buffer { float x[]; };
// Scale factor supplied by the host as a push constant.
layout(push_constant) uniform PushConsts { float a; };

// Multiplies the element owned by this invocation by `a`.
void main() {
    // A dispatch always launches whole workgroups of 1024 invocations, so the
    // invocation count is generally larger than the element count. Invocations
    // past the end of the bound range must not touch the buffer.
    // x.length() reflects the range bound through the descriptor, so the
    // descriptor range has to be given in bytes covering every element.
    uint idx = gl_GlobalInvocationID.x;
    if (idx < uint(x.length())) {
        x[idx] *= a;
    }
}

With my current implementation, my debug prints show:

in:
1 2 3 4 5 5 4 3 2 1
[7]

workgroups: (1,1,1)

out:
7 14 3 4 5 5 4 3 2 1

1 2 3 4 5 5 4 3 2 1 being the initial values of the buffer, 7 being the value of the push constant, (1,1,1) being the number of workgroups set and 7 14 3 4 5 5 4 3 2 1 being the resultant values of the buffer after execution.
Strangely only the 1st 2 values have been multiplied by the push constant (which leads me to think only the 1st 2 invocations ran).
I suspect this is likely an issue with memory allocation, although I don’t know where it may be coming from.

I would greatly appreciate any help, and if there is anything I can do to make this post better please let me know.

Here’s the project link

Hello there,

RenderDoc is an awesome graphics debugger which also allows you to debug shaders (including
compute shaders) as far as I know.

Try launching your application directly from RenderDoc and take a snapshot (F12 or print key). RenderDoc can only catch it if it runs every frame though. An alternative would be to launch your app in RenderDoc with the option Queue capture: frame 0 enabled:

Always keep RenderDoc in mind when debugging a Vulkan application. It’s in my opinion the most powerful debugging tool for the API.

best regards,
Johannes

So running RenderDoc and launching the application as such:

Launches, runs and closes the application. What should I be looking for here?

Hi there
Does your program actually render anything?

To be honest I am not sure if RenderDoc works if the application does not render anything at all =(.

Hmm maybe this article gives you some hints what’s wrong:

best regards
Johannes

Can you add the relevant parts for your compute? How you dispatch and how you copy and/or sync the read-back.

As for RenderDoc: If you’re doing headless, you can use RenderDoc's API, see e.g. https://github.com/SaschaWillems/Vulkan-RenderDoc-API/blob/master/src/main.cpp

Will do some more looking into RenderDoc and the API.

I think these are the parts you are asking about right?

Pipeline (line 533 in project):

/// Creates the shader module, pipeline layout and compute pipeline for one compute shader.
///
/// @param device               Logical device the objects are created on.
/// @param shaderFile           Path passed to readFile() to load the shader code.
/// @param computeShaderModule  Out: receives the created shader module.
/// @param descriptorSetLayout  The single descriptor set layout the pipeline layout exposes.
/// @param pipelineLayout       Out: receives the created pipeline layout.
/// @param pipeline             Out: receives the created compute pipeline.
/// @param pushConstants        Not read by this function; only the count matters here.
/// @param numPushConstants     Number of float push constants the layout reserves
///                             (0 means the layout declares no push-constant range).
void createComputePipeline(
    VkDevice& device,
    char* shaderFile,
    VkShaderModule* computeShaderModule,
    VkDescriptorSetLayout* descriptorSetLayout,
    VkPipelineLayout* pipelineLayout,
    VkPipeline* pipeline,
    float const* pushConstants,
    uint32_t numPushConstants
) {
    // Shader module: a thin wrapper around the shader code loaded from disk.
    // NOTE(review): the buffer returned by readFile() is never released here —
    // confirm who owns it.
    uint32_t codeLength = 0;
    VkShaderModuleCreateInfo moduleInfo = {};
    moduleInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
    moduleInfo.pCode = readFile(codeLength, shaderFile);
    moduleInfo.codeSize = codeLength;

    VK_CHECK_RESULT(vkCreateShaderModule(device, &moduleInfo, nullptr, computeShaderModule));

    // One range of floats starting at offset 0, visible to the compute stage.
    VkPushConstantRange pushConstantRange = {};
    pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
    pushConstantRange.offset = 0;
    pushConstantRange.size = static_cast<uint32_t>(numPushConstants * sizeof(float));

    // A push-constant range must have a non-zero size, so only register the
    // range when there is at least one push constant.
    const bool hasPushConstants = numPushConstants > 0;

    // The pipeline layout is what lets the pipeline access descriptor sets
    // and push constants.
    VkPipelineLayoutCreateInfo layoutInfo = {};
    layoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
    layoutInfo.setLayoutCount = 1; // one descriptor set layout
    layoutInfo.pSetLayouts = descriptorSetLayout;
    layoutInfo.pushConstantRangeCount = hasPushConstants ? 1 : 0;
    layoutInfo.pPushConstantRanges = hasPushConstants ? &pushConstantRange : nullptr;

    VK_CHECK_RESULT(vkCreatePipelineLayout(device, &layoutInfo, nullptr, pipelineLayout));

    // Compute stage: the module created above, entered at "main".
    VkPipelineShaderStageCreateInfo stageInfo = {};
    stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
    stageInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
    stageInfo.module = *computeShaderModule;
    stageInfo.pName = "main";

    VkComputePipelineCreateInfo pipelineInfo = {};
    pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
    pipelineInfo.stage = stageInfo; // copied by value
    pipelineInfo.layout = *pipelineLayout;

    // Create the single compute pipeline (no pipeline cache).
    VK_CHECK_RESULT(vkCreateComputePipelines(
        device, VK_NULL_HANDLE,
        1, &pipelineInfo,
        nullptr, pipeline));
}

Dispatch (line 605 in project):

void createCommandBuffer(
    uint32_t queueFamilyIndex,
    VkDevice& device,
    VkCommandPool* commandPool,
    VkCommandBuffer* commandBuffer,
    VkPipeline& pipeline,
    VkPipelineLayout& pipelineLayout,
    float const* pushConstants,
    uint32_t numPushConstants,
    int const* dims, // [x,y,z],
    int const* dimLengths // [local_size_x, local_size_y, local_size_z]
) {
    // Creates command pool
    VkCommandPoolCreateInfo commandPoolCreateInfo = {};
    {
        commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
        commandPoolCreateInfo.flags = 0;
        // Sets queue family
        commandPoolCreateInfo.queueFamilyIndex = queueFamilyIndex;
    }
    VK_CHECK_RESULT(vkCreateCommandPool(device, &commandPoolCreateInfo, NULL, commandPool));

    //  Allocates command buffer
    VkCommandBufferAllocateInfo commandBufferAllocateInfo = {};
    {
        commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
        commandBufferAllocateInfo.commandPool = *commandPool; // Pool to allocate from
        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
        commandBufferAllocateInfo.commandBufferCount = 1; // Allocates 1 command buffer. 
    }
    VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &commandBufferAllocateInfo, commandBuffer)); // allocate command buffer.

    // Allocated command buffer options
    VkCommandBufferBeginInfo beginInfo = {};
    {
        beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
        // Buffer only submitted once
        beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    }
    // Start recording commands
    VK_CHECK_RESULT(vkBeginCommandBuffer(*commandBuffer, &beginInfo));

    // Binds pipeline (our functions)
    vkCmdBindPipeline(*commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
    // Binds descriptor set (our data)
    vkCmdBindDescriptorSets(*commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, 1, &descriptorSet, 0, NULL);
        
    // Sets push constants
    vkCmdPushConstants(*commandBuffer, pipelineLayout,VK_SHADER_STAGE_COMPUTE_BIT,0, numPushConstants*sizeof(float), pushConstants);

    std::cout << "workgroups: " << '(' <<
        (uint32_t)ceil(dims[0] / (float)dimLengths[0]) << ',' <<
        (uint32_t)ceil(dims[1] / (float)dimLengths[1]) << ',' <<
        (uint32_t)ceil(dims[2] / (float)dimLengths[2]) << ')' << 
        std::endl << std::endl;

    // Sets invocations
    vkCmdDispatch(
        *commandBuffer,
        (uint32_t) ceil(dims[0] / (float) dimLengths[0]),
        (uint32_t) ceil(dims[1] / (float) dimLengths[1]),
        (uint32_t) ceil(dims[2] / (float) dimLengths[2])
    );

    // End recording commands
    VK_CHECK_RESULT(vkEndCommandBuffer(*commandBuffer));
}

Running (line 674 in project):

// Submits one recorded command buffer to `queue` and blocks until the
// submission's fence signals (or the wait times out), then destroys the fence.
void runCommandBuffer(
    VkCommandBuffer* commandBuffer,
    VkDevice& device,
    VkQueue& queue
) {
    // Fence used to wait for the submission; created unsignaled (flags = 0).
    VkFenceCreateInfo fenceInfo = {};
    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
    fenceInfo.flags = 0;

    VkFence submissionDone;
    VK_CHECK_RESULT(vkCreateFence(device, &fenceInfo, NULL, &submissionDone));

    // A single batch containing exactly one command buffer.
    VkSubmitInfo batch = {};
    batch.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    batch.commandBufferCount = 1;
    batch.pCommandBuffers = commandBuffer;

    // Hand the batch to the queue; the fence signals once it has finished.
    VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &batch, submissionDone));

    // Block on the fence. The last argument is the timeout, which Vulkan
    // interprets in nanoseconds.
    VK_CHECK_RESULT(vkWaitForFences(device, 1, &submissionDone, VK_TRUE, 100000000000));

    // The fence is only needed for this one submission.
    vkDestroyFence(device, submissionDone, NULL);
}

Reading (line 708 in project):

/// Maps the whole memory allocation, prints its first `size` floats to stdout
/// under an "out:" heading, then unmaps it.
///
/// @param device        Logical device owning the memory.
/// @param bufferMemory  Device memory to read back; must be host-mappable.
/// @param size          Number of float elements (not bytes) to print.
void printOutput(
    VkDevice& device,
    VkDeviceMemory& bufferMemory,
    uint32_t size
) {
    // Check the mapping like every other Vulkan call in this file; on failure
    // `mapped` would stay null and the loop below would dereference it.
    // NOTE(review): if the memory is not host-coherent, the mapped range
    // presumably needs invalidating before it is read — confirm the memory type.
    void* mapped = nullptr;
    VK_CHECK_RESULT(vkMapMemory(device, bufferMemory, 0, VK_WHOLE_SIZE, 0, &mapped));
    const float* values = static_cast<const float*>(mapped);

    std::cout << "out:" << std::endl;
    std::cout << '\t';
    // Unsigned index to match `size` and avoid a signed/unsigned comparison.
    for (uint32_t i = 0; i < size; ++i) {
        std::cout << values[i] << ' ';
    }
    std::cout << std::endl;

    vkUnmapMemory(device, bufferMemory);
}

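The problem originated from setting the size of the VkBuffer and the range of the VkDescriptorBufferInfo to the number of values rather than the number of bytes. With 10 values, only 10 bytes were bound — room for just two whole 4-byte floats — which explains why only the first two elements were multiplied.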

Making these changes fixes it:

  • bufferCreateInfo.size = size; -> bufferCreateInfo.size = sizeof(float)*size;
  • bindings[i].range = size; -> bindings[i].range = VK_WHOLE_SIZE;