vkCmdDrawIndexedIndirect error on Nvidia, works fine on Intel


#1

I am getting a VK_ERROR_DEVICE_LOST return code in my queue submit command when using vkCmdDrawIndexedIndirect(), with Nvidia hardware only. It works perfectly fine with Intel graphics.

It also works fine if I use vkCmdDrawIndexed for each batch, one by one, on both Nvidia and Intel.

The device->SetBufferData method uses a staging buffer that is associated with the command buffer to transfer data to the indirect buffer.

This has a lot of my own wrapper code in it, but you can see some of what is going on. Can you see anything wrong with this?:

bool VkuContext::RecordFrame(const int currentFrame)
{
	commandbuffers[currentFrame]->Reset();
	refreshcommandbuffer[currentFrame] = false;

	//---------------------------------------------------------------
	// Draw commands
	//---------------------------------------------------------------

	// Build draw structures
	uint32_t firstInstance = 0;
	VkDrawIndexedIndirectCommand command;
	for (auto mesh : queuedmeshes)
	{
		if (mesh->indiceCount != 0 and mesh->instanceCount != 0)
		{
			command.indexCount = mesh->indiceCount;
			command.vertexOffset = mesh->vertexoffset;
			command.firstInstance = firstInstance;
			command.firstIndex = mesh->indiceoffset;
			command.instanceCount = mesh->instanceCount;
			commandbuffers[currentFrame]->drawcommands.push_back(command);
		}
		else
		{
			Assert(0);
		}
		firstInstance += mesh->instanceCount;
	}

	// Create buffer and send data
	if (!commandbuffers[currentFrame]->drawcommands.empty())
	{
		auto buffersz = commandbuffers[currentFrame]->drawcommands.size() * sizeof(VkDrawIndexedIndirectCommand);
		if (commandbuffers[currentFrame]->indirectcommandbuffer != nullptr)
		{
			if (buffersz > commandbuffers[currentFrame]->indirectcommandbuffer->GetSize())
			{
				commandbuffers[currentFrame]->indirectcommandbuffer = nullptr;
			}
		}
		if (commandbuffers[currentFrame]->indirectcommandbuffer == nullptr)
		{
			commandbuffers[currentFrame]->indirectcommandbuffer = make_shared<GPUBuffer>(device);
			commandbuffers[currentFrame]->indirectcommandbuffer->Initialize(buffersz, VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VMA_MEMORY_USAGE_GPU_ONLY);
		}
		device->SetBufferData(commandbuffers[currentFrame]->indirectcommandbuffer, &commandbuffers[currentFrame]->drawcommands[0], buffersz, false);
		commandbuffers[currentFrame]->BindResource(commandbuffers[currentFrame]->indirectcommandbuffer);
	}

	//---------------------------------------------------------------
	// Send instance data
	//---------------------------------------------------------------

	// Keep storage buffers in scope until command buffer is reset
	commandbuffers[currentFrame]->boundresources.insert(commandbuffers[currentFrame]->boundresources.end(), device->storagebuffer.begin(), device->storagebuffer.end());

	//---------------------------------------------------------------
	// Begin recording
	//---------------------------------------------------------------
	
	commandbuffers[currentFrame]->Begin();
	
	//---------------------------------------------------------------
	// Begin render pass
	//---------------------------------------------------------------

	VkRenderPassBeginInfo renderPassBeginInfo = {};
	renderPassBeginInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
	renderPassBeginInfo.renderPass = renderPass;
	renderPassBeginInfo.framebuffer = framebuffers[currentFrame];
	renderPassBeginInfo.renderArea.offset = { 0, 0 };
	renderPassBeginInfo.renderArea.extent = chaininfo.imageExtent;
	VkClearValue clear = { clearColor[0], clearColor[1], clearColor[2], clearColor[3] };
	renderPassBeginInfo.clearValueCount = 1;
	renderPassBeginInfo.pClearValues = &clear;
	vkCmdBeginRenderPass(commandbuffers[currentFrame]->commandbuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);

	//---------------------------------------------------------------
	// Record commands
	//---------------------------------------------------------------

	// Bind pipeline
	vkCmdBindPipeline(commandbuffers[currentFrame]->commandbuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline);

	// Bind storage buffers
	vkCmdBindDescriptorSets(commandbuffers[currentFrame]->commandbuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, environment->pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);

	// Push constants
	vkCmdPushConstants(commandbuffers[currentFrame]->commandbuffer, device->instance->pipelineLayout, VK_SHADER_STAGE_ALL, 0, sizeof(device->instance->shaderglobals), &device->instance->shaderglobals);

	// Bind vertex and index buffers
	vkCmdBindVertexBuffers(commandbuffers[currentFrame]->commandbuffer, 0, 1, &device->vertexbuffer->buffer, offsets);
	vkCmdBindIndexBuffer(commandbuffers[currentFrame]->commandbuffer, device->indicebuffer->buffer, 0, VK_INDEX_TYPE_UINT32);
	commandbuffers[currentFrame]->BindResource(device->vertexbuffer);
	commandbuffers[currentFrame]->BindResource(device->indicebuffer);

	// Draw
	if (!commandbuffers[currentFrame]->drawcommands.empty())
	{
		if (commandbuffers[currentFrame]->drawcommands.size() > device->limits.maxDrawIndirectCount)
		{
			// One batch at a time
			for (int n = 0; n < commandbuffers[currentFrame]->drawcommands.size(); ++n)
			{
				//vkCmdDrawIndexed(commandbuffers[currentFrame]->commandbuffer, commandbuffers[currentFrame]->drawcommands[n].indexCount, commandbuffers[currentFrame]->drawcommands[n].instanceCount, commandbuffers[currentFrame]->drawcommands[n].firstIndex, commandbuffers[currentFrame]->drawcommands[n].vertexOffset, commandbuffers[currentFrame]->drawcommands[n].firstInstance);
				vkCmdDrawIndexedIndirect(commandbuffers[currentFrame]->commandbuffer, commandbuffers[currentFrame]->indirectcommandbuffer->buffer, n * sizeof(VkDrawIndexedIndirectCommand), 1, sizeof(VkDrawIndexedIndirectCommand));
			}
		}
		else
		{
			//All at once
			vkCmdDrawIndexedIndirect(commandbuffers[currentFrame]->commandbuffer, commandbuffers[currentFrame]->indirectcommandbuffer->buffer, 0, commandbuffers[currentFrame]->drawcommands.size(), sizeof(VkDrawIndexedIndirectCommand));
		}
	}

	// Finish up
	commandbuffers[currentFrame]->EndRenderPass();
	commandbuffers[currentFrame]->End();
	
	return true;
}

#2

Does it also apply a barrier/event/something to ensure visibility of the transferred data? What validation layers are you using when testing the problem?


#3

I am using the “VK_LAYER_LUNARG_standard_validation” validation layer.

No fences are used for the buffer transfer, just one fence per command buffer to make sure it has completed. I map the data to a per-command-buffer staging buffer, then insert a vkCmdCopyBuffer call into the queue to copy from the staging buffer to the indirect buffer. So the staging buffer is only mapped after the command buffer's fence is cleared, and the copy from the staging buffer to the indirect buffer only happens inside the command queue, so I think that should all be safe.


#4

Famous last words.

There is nothing in vkCmdCopyBuffer which ensures the execution order of this command relative to any subsequent commands, nor which ensures the visibility of any memory modified by this command to subsequent commands. And there is nothing in a render pass or an indirect dispatch operation which ensures the execution of any commands issued before that command or the visibility of any memory operations modified before that command.

Just putting stuff in the same queue or command buffer ensures nothing about execution or visibility. OK, “nothing” is an exaggeration, but it is only a slight exaggeration, and none of the exceptions covers the case you describe.

With the exception of certain render pass operations with respect to attachments, and with the exception of blending and other framebuffer operations, anything which manipulates memory that gets consumed later needs some kind of explicit synchronization mechanism between them which ensures the execution and visibility.


#5

I assumed the commands would be executed in the order they are added to the queue. Can you point me in the right direction for the additional required functionality I need to implement?


#6

It looks like memory barriers are the next step.


#7

To my surprise, my first attempt worked perfectly. Here is my code:

	std::vector<VkBufferMemoryBarrier> memoryBarriers;
	VkBufferMemoryBarrier memoryBarrier;
	shared_ptr<GPUBuffer> buf;

	for (int n = 0; n < 4; ++n)
	{
		if (n == 3)
		{
			buf = commandbuffers[currentFrame]->indirectcommandbuffer;
			if (buf == nullptr) break;
		}
		else
		{
			buf = device->storagebuffer[n];
		}
		memoryBarrier.buffer = buf->buffer;
		memoryBarrier.dstQueueFamilyIndex = device->instance->familyindex;
		memoryBarrier.offset = 0;
		memoryBarrier.pNext = nullptr;
		memoryBarrier.size = buf->GetSize();
		memoryBarrier.srcQueueFamilyIndex = device->instance->familyindex;
		memoryBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
		memoryBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
		memoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
		memoryBarriers.push_back(memoryBarrier);
	}

	vkCmdPipelineBarrier(commandbuffers[currentFrame]->commandbuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, 0, 0, nullptr, memoryBarriers.size(), &memoryBarriers[0], 0, nullptr);